diff options
Diffstat (limited to 'llvm/test')
40 files changed, 64067 insertions, 139 deletions
diff --git a/llvm/test/CodeGen/X86/SwizzleShuff.ll b/llvm/test/CodeGen/X86/SwizzleShuff.ll index 4019ee42679..e6519a60a4b 100644 --- a/llvm/test/CodeGen/X86/SwizzleShuff.ll +++ b/llvm/test/CodeGen/X86/SwizzleShuff.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -x86-experimental-vector-widening-legalization | FileCheck %s ; Check that we perform a scalar XOR on i32. diff --git a/llvm/test/CodeGen/X86/avx512-cvt-widen.ll b/llvm/test/CodeGen/X86/avx512-cvt-widen.ll new file mode 100644 index 00000000000..eddd0039507 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512-cvt-widen.ll @@ -0,0 +1,2645 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=KNL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLNOBW --check-prefix=AVX512VL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=DQNOVL --check-prefix=AVX512DQ +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLNOBW --check-prefix=AVX512VLDQ +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLBW --check-prefix=AVX512VLBW + + +define <16 x float> @sitof32(<16 x i32> %a) nounwind { +; ALL-LABEL: sitof32: +; ALL: # %bb.0: +; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 +; ALL-NEXT: retq + %b = sitofp <16 x i32> %a to <16 x float> + ret <16 x float> %b +} + +define <8 x double> @sltof864(<8 x i64> %a) { +; NODQ-LABEL: sltof864: +; NODQ: # %bb.0: +; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: sltof864: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtqq2pd %zmm0, %zmm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: sltof864: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvtqq2pd %zmm0, %zmm0 +; DQNOVL-NEXT: retq + %b = sitofp <8 x i64> %a to <8 x double> + ret <8 x double> %b +} + +define <4 x double> @slto4f64(<4 x i64> %a) { +; NODQ-LABEL: slto4f64: +; NODQ: # %bb.0: +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: slto4f64: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtqq2pd %ymm0, %ymm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: slto4f64: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; DQNOVL-NEXT: vcvtqq2pd %zmm0, %zmm0 +; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; DQNOVL-NEXT: retq + %b = sitofp <4 x i64> %a to <4 x double> + ret <4 x double> %b +} + +define <2 x double> @slto2f64(<2 x i64> %a) { +; NODQ-LABEL: slto2f64: +; NODQ: # %bb.0: +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; NODQ-NEXT: retq +; +; VLDQ-LABEL: slto2f64: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtqq2pd %xmm0, %xmm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: slto2f64: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; DQNOVL-NEXT: vcvtqq2pd %zmm0, %zmm0 +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; DQNOVL-NEXT: vzeroupper +; DQNOVL-NEXT: retq + %b = sitofp <2 x i64> %a to <2 x double> + ret <2 x double> %b +} + +define <2 x float> @sltof2f32(<2 x i64> %a) { +; NOVLDQ-LABEL: sltof2f32: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: vpextrq $1, %xmm0, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; NOVLDQ-NEXT: vmovq %xmm0, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; NOVLDQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: sltof2f32: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtqq2ps %xmm0, %xmm0 +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: sltof2f32: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; VLNODQ-NEXT: vmovq %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; VLNODQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; VLNODQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: sltof2f32: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0 +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; DQNOVL-NEXT: vzeroupper +; DQNOVL-NEXT: retq + %b = sitofp <2 x i64> %a to <2 x float> + ret <2 x float>%b +} + +define <4 x float> @slto4f32_mem(<4 x i64>* %a) { +; NODQ-LABEL: slto4f32_mem: +; NODQ: # %bb.0: +; NODQ-NEXT: vmovdqu (%rdi), %xmm0 +; NODQ-NEXT: vmovdqu 16(%rdi), %xmm1 +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; NODQ-NEXT: retq +; +; VLDQ-LABEL: slto4f32_mem: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtqq2psy (%rdi), %xmm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: slto4f32_mem: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vmovups (%rdi), %ymm0 +; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0 +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; DQNOVL-NEXT: vzeroupper +; DQNOVL-NEXT: retq + %a1 = load <4 x i64>, <4 x i64>* %a, align 8 + %b = sitofp <4 x i64> %a1 to <4 x float> + ret <4 x float>%b +} + +define <4 x i64> @f64to4sl(<4 x double> %a) { +; NODQ-LABEL: f64to4sl: +; NODQ: # %bb.0: +; NODQ-NEXT: vextractf128 $1, %ymm0, %xmm1 +; NODQ-NEXT: vcvttsd2si %xmm1, %rax +; NODQ-NEXT: vmovq %rax, %xmm2 +; NODQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; NODQ-NEXT: vcvttsd2si %xmm1, %rax +; NODQ-NEXT: vmovq %rax, %xmm1 +; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; NODQ-NEXT: vcvttsd2si %xmm0, %rax +; NODQ-NEXT: vmovq %rax, %xmm2 +; NODQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; NODQ-NEXT: vcvttsd2si %xmm0, %rax +; NODQ-NEXT: vmovq %rax, %xmm0 +; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; NODQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: f64to4sl: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttpd2qq %ymm0, %ymm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: f64to4sl: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; DQNOVL-NEXT: vcvttpd2qq %zmm0, %zmm0 +; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; DQNOVL-NEXT: retq + %b = fptosi <4 x double> %a to <4 x i64> + ret <4 x i64> %b +} + +define <4 x i64> @f32to4sl(<4 x float> %a) { +; NODQ-LABEL: f32to4sl: +; NODQ: # %bb.0: +; NODQ-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; NODQ-NEXT: vcvttss2si %xmm1, %rax +; NODQ-NEXT: vmovq %rax, %xmm1 +; NODQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; NODQ-NEXT: vcvttss2si %xmm2, %rax +; NODQ-NEXT: vmovq %rax, %xmm2 +; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; NODQ-NEXT: vcvttss2si %xmm0, %rax +; NODQ-NEXT: vmovq %rax, %xmm2 +; NODQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; NODQ-NEXT: vcvttss2si %xmm0, %rax +; NODQ-NEXT: vmovq %rax, %xmm0 +; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; NODQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: f32to4sl: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: f32to4sl: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; DQNOVL-NEXT: vcvttps2qq %ymm0, %zmm0 +; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; DQNOVL-NEXT: retq + %b = fptosi <4 x float> %a to <4 x i64> + ret <4 x i64> %b +} + +define <4 x float> @slto4f32(<4 x i64> %a) { +; NODQ-LABEL: slto4f32: +; NODQ: # %bb.0: +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; NODQ-NEXT: vzeroupper +; NODQ-NEXT: retq +; +; VLDQ-LABEL: slto4f32: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0 +; VLDQ-NEXT: vzeroupper +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: slto4f32: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0 +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; DQNOVL-NEXT: vzeroupper +; DQNOVL-NEXT: retq + %b = sitofp <4 x i64> %a to <4 x float> + ret <4 x float> %b +} + +define <4 x float> @ulto4f32(<4 x i64> %a) { +; NODQ-LABEL: ulto4f32: +; NODQ: # %bb.0: +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; NODQ-NEXT: vzeroupper +; NODQ-NEXT: retq +; +; VLDQ-LABEL: ulto4f32: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0 +; VLDQ-NEXT: vzeroupper +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: ulto4f32: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; DQNOVL-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; DQNOVL-NEXT: vzeroupper +; DQNOVL-NEXT: retq + %b = uitofp <4 x i64> %a to <4 x float> + ret <4 x float> %b +} + +define <8 x double> @ulto8f64(<8 x i64> %a) { +; NODQ-LABEL: ulto8f64: +; NODQ: # %bb.0: +; NODQ-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1 +; NODQ-NEXT: vporq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; NODQ-NEXT: vpsrlq $32, %zmm0, %zmm0 +; NODQ-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; NODQ-NEXT: vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; NODQ-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: ulto8f64: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: ulto8f64: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvtuqq2pd %zmm0, %zmm0 +; DQNOVL-NEXT: retq + %b = uitofp <8 x i64> %a to <8 x double> + ret <8 x double> %b +} + +define <16 x double> @ulto16f64(<16 x i64> %a) { +; NODQ-LABEL: ulto16f64: +; NODQ: # %bb.0: +; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm2 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] +; NODQ-NEXT: vpandq %zmm2, %zmm0, %zmm3 +; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm4 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] +; NODQ-NEXT: vporq %zmm4, %zmm3, %zmm3 +; NODQ-NEXT: vpsrlq $32, %zmm0, %zmm0 +; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm5 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] +; NODQ-NEXT: vporq %zmm5, %zmm0, %zmm0 +; NODQ-NEXT: vbroadcastsd {{.*#+}} zmm6 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] +; NODQ-NEXT: vsubpd %zmm6, %zmm0, %zmm0 +; NODQ-NEXT: vaddpd %zmm0, %zmm3, %zmm0 +; NODQ-NEXT: vpandq %zmm2, %zmm1, %zmm2 +; NODQ-NEXT: vporq %zmm4, %zmm2, %zmm2 +; NODQ-NEXT: vpsrlq $32, %zmm1, %zmm1 +; NODQ-NEXT: vporq %zmm5, %zmm1, %zmm1 +; NODQ-NEXT: vsubpd %zmm6, %zmm1, %zmm1 +; NODQ-NEXT: vaddpd %zmm1, %zmm2, %zmm1 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: ulto16f64: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 +; VLDQ-NEXT: vcvtuqq2pd %zmm1, %zmm1 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: ulto16f64: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvtuqq2pd %zmm0, %zmm0 +; DQNOVL-NEXT: vcvtuqq2pd %zmm1, %zmm1 +; DQNOVL-NEXT: retq + %b = uitofp <16 x i64> %a to <16 x double> + ret <16 x double> %b +} + +define <16 x i32> @f64to16si(<16 x float> %a) nounwind { +; ALL-LABEL: f64to16si: +; ALL: # %bb.0: +; ALL-NEXT: vcvttps2dq %zmm0, %zmm0 +; ALL-NEXT: retq + %b = fptosi <16 x float> %a to <16 x i32> + ret <16 x i32> %b +} + +define <16 x i8> @f32to16sc(<16 x float> %f) { +; ALL-LABEL: f32to16sc: +; ALL: # %bb.0: +; ALL-NEXT: vcvttps2dq %zmm0, %zmm0 +; ALL-NEXT: vpmovdb %zmm0, %xmm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %res = fptosi <16 x float> %f to <16 x i8> + ret <16 x i8> %res +} + +define <16 x i16> @f32to16ss(<16 x float> %f) { +; ALL-LABEL: f32to16ss: +; ALL: # %bb.0: +; ALL-NEXT: vcvttps2dq %zmm0, %zmm0 +; ALL-NEXT: vpmovdw %zmm0, %ymm0 +; ALL-NEXT: retq + %res = fptosi <16 x float> %f to <16 x i16> + ret <16 x i16> %res +} + +define <16 x i32> @f32to16ui(<16 x float> %a) nounwind { +; ALL-LABEL: f32to16ui: +; ALL: # %bb.0: +; ALL-NEXT: vcvttps2udq %zmm0, %zmm0 +; ALL-NEXT: retq + %b = fptoui <16 x float> %a to <16 x i32> + ret <16 x i32> %b +} + +define <16 x i8> @f32to16uc(<16 x float> %f) { +; ALL-LABEL: f32to16uc: +; ALL: # %bb.0: +; ALL-NEXT: vcvttps2dq %zmm0, %zmm0 +; ALL-NEXT: vpmovdb %zmm0, %xmm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %res = fptoui <16 x float> %f to <16 x i8> + ret <16 x i8> %res +} + +define <16 x i16> @f32to16us(<16 x float> %f) { +; ALL-LABEL: f32to16us: +; ALL: # %bb.0: +; ALL-NEXT: vcvttps2dq %zmm0, %zmm0 +; ALL-NEXT: vpmovdw %zmm0, %ymm0 +; ALL-NEXT: retq + %res = fptoui <16 x float> %f to <16 x i16> + ret <16 x i16> %res +} + +define <8 x i32> @f32to8ui(<8 x float> %a) nounwind { +; NOVL-LABEL: f32to8ui: +; NOVL: # %bb.0: +; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0 +; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; NOVL-NEXT: retq +; +; VL-LABEL: f32to8ui: +; VL: # %bb.0: +; VL-NEXT: vcvttps2udq %ymm0, %ymm0 +; VL-NEXT: retq + %b = fptoui <8 x float> %a to <8 x i32> + ret <8 x i32> %b +} + +define <4 x i32> @f32to4ui(<4 x float> %a) nounwind { +; NOVL-LABEL: f32to4ui: +; NOVL: # %bb.0: +; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0 +; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; NOVL-NEXT: vzeroupper +; NOVL-NEXT: retq +; +; VL-LABEL: f32to4ui: +; VL: # %bb.0: +; VL-NEXT: vcvttps2udq %xmm0, %xmm0 +; VL-NEXT: retq + %b = fptoui <4 x float> %a to <4 x i32> + ret <4 x i32> %b +} + +define <8 x i32> @f64to8ui(<8 x double> %a) nounwind { +; ALL-LABEL: f64to8ui: +; ALL: # %bb.0: +; ALL-NEXT: vcvttpd2udq %zmm0, %ymm0 +; ALL-NEXT: retq + %b = fptoui <8 x double> %a to <8 x i32> + ret <8 x i32> %b +} + +define <8 x i16> @f64to8us(<8 x double> %f) { +; NOVL-LABEL: f64to8us: +; NOVL: # %bb.0: +; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; NOVL-NEXT: vpmovdw %zmm0, %ymm0 +; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; NOVL-NEXT: vzeroupper +; NOVL-NEXT: retq +; +; VL-LABEL: f64to8us: +; VL: # %bb.0: +; VL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; VL-NEXT: vpmovdw %ymm0, %xmm0 +; VL-NEXT: vzeroupper +; VL-NEXT: retq + %res = fptoui <8 x double> %f to <8 x i16> + ret <8 x i16> %res +} + +define <8 x i8> @f64to8uc(<8 x double> %f) { +; NOVL-LABEL: f64to8uc: +; NOVL: # %bb.0: +; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; NOVL-NEXT: vpmovdb %zmm0, %xmm0 +; NOVL-NEXT: vzeroupper +; NOVL-NEXT: retq +; +; VL-LABEL: f64to8uc: +; VL: # %bb.0: +; VL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; VL-NEXT: vpmovdb %ymm0, %xmm0 +; VL-NEXT: vzeroupper +; VL-NEXT: retq + %res = fptoui <8 x double> %f to <8 x i8> + ret <8 x i8> %res +} + +define <4 x i32> @f64to4ui(<4 x double> %a) nounwind { +; NOVL-LABEL: f64to4ui: +; NOVL: # %bb.0: +; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; NOVL-NEXT: vcvttpd2udq %zmm0, %ymm0 +; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; NOVL-NEXT: vzeroupper +; NOVL-NEXT: retq +; +; VL-LABEL: f64to4ui: +; VL: # %bb.0: +; VL-NEXT: vcvttpd2udq %ymm0, %xmm0 +; VL-NEXT: vzeroupper +; VL-NEXT: retq + %b = fptoui <4 x double> %a to <4 x i32> + ret <4 x i32> %b +} + +define <8 x double> @sito8f64(<8 x i32> %a) { +; ALL-LABEL: sito8f64: +; ALL: # %bb.0: +; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 +; ALL-NEXT: retq + %b = sitofp <8 x i32> %a to <8 x double> + ret <8 x double> %b +} +define <8 x double> @i32to8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind { +; KNL-LABEL: i32to8f64_mask: +; KNL: # %bb.0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} +; KNL-NEXT: retq +; +; VLBW-LABEL: i32to8f64_mask: +; VLBW: # %bb.0: +; VLBW-NEXT: kmovd %edi, %k1 +; VLBW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} +; VLBW-NEXT: retq +; +; VLNOBW-LABEL: i32to8f64_mask: +; VLNOBW: # %bb.0: +; VLNOBW-NEXT: kmovw %edi, %k1 +; VLNOBW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} +; VLNOBW-NEXT: retq +; +; DQNOVL-LABEL: i32to8f64_mask: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: kmovw %edi, %k1 +; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} +; DQNOVL-NEXT: retq +; +; AVX512BW-LABEL: i32to8f64_mask: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} +; AVX512BW-NEXT: retq + %1 = bitcast i8 %c to <8 x i1> + %2 = sitofp <8 x i32> %b to <8 x double> + %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a + ret <8 x double> %3 +} +define <8 x double> @sito8f64_maskz(<8 x i32> %a, i8 %b) nounwind { +; KNL-LABEL: sito8f64_maskz: +; KNL: # %bb.0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} +; KNL-NEXT: retq +; +; VLBW-LABEL: sito8f64_maskz: +; VLBW: # %bb.0: +; VLBW-NEXT: kmovd %edi, %k1 +; VLBW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} +; VLBW-NEXT: retq +; +; VLNOBW-LABEL: sito8f64_maskz: +; VLNOBW: # %bb.0: +; VLNOBW-NEXT: kmovw %edi, %k1 +; VLNOBW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} +; VLNOBW-NEXT: retq +; +; DQNOVL-LABEL: sito8f64_maskz: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: kmovw %edi, %k1 +; DQNOVL-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} +; DQNOVL-NEXT: retq +; +; AVX512BW-LABEL: sito8f64_maskz: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq + %1 = bitcast i8 %b to <8 x i1> + %2 = sitofp <8 x i32> %a to <8 x double> + %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer + ret <8 x double> %3 +} + +define <8 x i32> @f64to8si(<8 x double> %a) { +; ALL-LABEL: f64to8si: +; ALL: # %bb.0: +; ALL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; ALL-NEXT: retq + %b = fptosi <8 x double> %a to <8 x i32> + ret <8 x i32> %b +} + +define <8 x i16> @f64to8ss(<8 x double> %f) { +; NOVL-LABEL: f64to8ss: +; NOVL: # %bb.0: +; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; NOVL-NEXT: vpmovdw %zmm0, %ymm0 +; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; NOVL-NEXT: vzeroupper +; NOVL-NEXT: retq +; +; VL-LABEL: f64to8ss: +; VL: # %bb.0: +; VL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; VL-NEXT: vpmovdw %ymm0, %xmm0 +; VL-NEXT: vzeroupper +; VL-NEXT: retq + %res = fptosi <8 x double> %f to <8 x i16> + ret <8 x i16> %res +} + +define <8 x i8> @f64to8sc(<8 x double> %f) { +; NOVL-LABEL: f64to8sc: +; NOVL: # %bb.0: +; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; NOVL-NEXT: vpmovdb %zmm0, %xmm0 +; NOVL-NEXT: vzeroupper +; NOVL-NEXT: retq +; +; VL-LABEL: f64to8sc: +; VL: # %bb.0: +; VL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; VL-NEXT: vpmovdb %ymm0, %xmm0 +; VL-NEXT: vzeroupper +; VL-NEXT: retq + %res = fptosi <8 x double> %f to <8 x i8> + ret <8 x i8> %res +} + +define <4 x i32> @f64to4si(<4 x double> %a) { +; ALL-LABEL: f64to4si: +; ALL: # %bb.0: +; ALL-NEXT: vcvttpd2dq %ymm0, %xmm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %b = fptosi <4 x double> %a to <4 x i32> + ret <4 x i32> %b +} + +define <16 x float> @f64to16f32(<16 x double> %b) nounwind { +; ALL-LABEL: f64to16f32: +; ALL: # %bb.0: +; ALL-NEXT: vcvtpd2ps %zmm0, %ymm0 +; ALL-NEXT: vcvtpd2ps %zmm1, %ymm1 +; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: retq + %a = fptrunc <16 x double> %b to <16 x float> + ret <16 x float> %a +} + +define <4 x float> @f64to4f32(<4 x double> %b) { +; ALL-LABEL: f64to4f32: +; ALL: # %bb.0: +; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %a = fptrunc <4 x double> %b to <4 x float> + ret <4 x float> %a +} + +define <4 x float> @f64to4f32_mask(<4 x double> %b, <4 x i1> %mask) { +; NOVLDQ-LABEL: f64to4f32_mask: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: vpslld $31, %xmm1, %xmm1 +; NOVLDQ-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NOVLDQ-NEXT: vcvtpd2ps %ymm0, %xmm0 +; NOVLDQ-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; NOVLDQ-NEXT: vzeroupper +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: f64to4f32_mask: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vpslld $31, %xmm1, %xmm1 +; VLDQ-NEXT: vpmovd2m %xmm1, %k1 +; VLDQ-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z} +; VLDQ-NEXT: vzeroupper +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: f64to4f32_mask: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vpslld $31, %xmm1, %xmm1 +; VLNODQ-NEXT: vptestmd %xmm1, %xmm1, %k1 +; VLNODQ-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z} +; VLNODQ-NEXT: vzeroupper +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: f64to4f32_mask: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vpslld $31, %xmm1, %xmm1 +; DQNOVL-NEXT: vpmovd2m %zmm1, %k1 +; DQNOVL-NEXT: vcvtpd2ps %ymm0, %xmm0 +; DQNOVL-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; DQNOVL-NEXT: vzeroupper +; DQNOVL-NEXT: retq + %a = fptrunc <4 x double> %b to <4 x float> + %c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer + ret <4 x float> %c +} + +define <4 x float> @f64tof32_inreg(<2 x double> %a0, <4 x float> %a1) nounwind { +; ALL-LABEL: f64tof32_inreg: +; ALL: # %bb.0: +; ALL-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0 +; ALL-NEXT: retq + %ext = extractelement <2 x double> %a0, i32 0 + %cvt = fptrunc double %ext to float + %res = insertelement <4 x float> %a1, float %cvt, i32 0 + ret <4 x float> %res +} + +define <8 x double> @f32to8f64(<8 x float> %b) nounwind { +; ALL-LABEL: f32to8f64: +; ALL: # %bb.0: +; ALL-NEXT: vcvtps2pd %ymm0, %zmm0 +; ALL-NEXT: retq + %a = fpext <8 x float> %b to <8 x double> + ret <8 x double> %a +} + +define <4 x double> @f32to4f64_mask(<4 x float> %b, <4 x double> %b1, <4 x double> %a1) { +; NOVL-LABEL: f32to4f64_mask: +; NOVL: # %bb.0: +; NOVL-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 +; NOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; NOVL-NEXT: vcvtps2pd %xmm0, %ymm0 +; NOVL-NEXT: vcmpltpd %zmm2, %zmm1, %k1 +; NOVL-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} +; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; NOVL-NEXT: retq +; +; VL-LABEL: f32to4f64_mask: +; VL: # %bb.0: +; VL-NEXT: vcmpltpd %ymm2, %ymm1, %k1 +; VL-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z} +; VL-NEXT: retq + %a = fpext <4 x float> %b to <4 x double> + %mask = fcmp ogt <4 x double> %a1, %b1 + %c = select <4 x i1> %mask, <4 x double> %a, <4 x double> zeroinitializer + ret <4 x double> %c +} + +define <4 x double> @f32to4f64_mask_load(<4 x float>* %p, <4 x double> %b1, <4 x double> %a1) { +; NOVL-LABEL: f32to4f64_mask_load: +; NOVL: # %bb.0: +; NOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; NOVL-NEXT: vcvtps2pd (%rdi), %ymm2 +; NOVL-NEXT: vcmpltpd %zmm1, %zmm0, %k1 +; NOVL-NEXT: vmovapd %zmm2, %zmm0 {%k1} {z} +; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; NOVL-NEXT: retq +; +; VL-LABEL: f32to4f64_mask_load: +; VL: # %bb.0: +; VL-NEXT: vcmpltpd %ymm1, %ymm0, %k1 +; VL-NEXT: vcvtps2pd (%rdi), %ymm0 {%k1} {z} +; VL-NEXT: retq + %b = load <4 x float>, <4 x float>* %p + %a = fpext <4 x float> %b to <4 x double> + %mask = fcmp ogt <4 x double> %a1, %b1 + %c = select <4 x i1> %mask, <4 x double> %a, <4 x double> zeroinitializer + ret <4 x double> %c +} + +define <2 x double> @f32tof64_inreg(<2 x double> %a0, <4 x float> %a1) nounwind { +; ALL-LABEL: f32tof64_inreg: +; ALL: # %bb.0: +; ALL-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: retq + %ext = extractelement <4 x float> %a1, i32 0 + %cvt = fpext float %ext to double + %res = insertelement <2 x double> %a0, double %cvt, i32 0 + ret <2 x double> %res +} + +define double @sltof64_load(i64* nocapture %e) { +; ALL-LABEL: sltof64_load: +; ALL: # %bb.0: # %entry +; ALL-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0 +; ALL-NEXT: retq +entry: + %tmp1 = load i64, i64* %e, align 8 + %conv = sitofp i64 %tmp1 to double + ret double %conv +} + +define double @sitof64_load(i32* %e) { +; ALL-LABEL: sitof64_load: +; ALL: # %bb.0: # %entry +; ALL-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0 +; ALL-NEXT: retq +entry: + %tmp1 = load i32, i32* %e, align 4 + %conv = sitofp i32 %tmp1 to double + ret double %conv +} + +define float @sitof32_load(i32* %e) { +; ALL-LABEL: sitof32_load: +; ALL: # %bb.0: # %entry +; ALL-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0 +; ALL-NEXT: retq +entry: + %tmp1 = load i32, i32* %e, align 4 + %conv = sitofp i32 %tmp1 to float + ret float %conv +} + +define float @sltof32_load(i64* %e) { +; ALL-LABEL: sltof32_load: +; ALL: # %bb.0: # %entry +; ALL-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0 +; ALL-NEXT: retq +entry: + %tmp1 = load i64, i64* %e, align 8 + %conv = sitofp i64 %tmp1 to float + ret float %conv +} + +define void @f32tof64_loadstore() { +; ALL-LABEL: f32tof64_loadstore: +; ALL: # %bb.0: # %entry +; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp) +; ALL-NEXT: retq +entry: + %f = alloca float, align 4 + %d = alloca double, align 8 + %tmp = load float, float* %f, align 4 + %conv = fpext float %tmp to double + store double %conv, double* %d, align 8 + ret void +} + +define void @f64tof32_loadstore() nounwind uwtable { +; ALL-LABEL: f64tof32_loadstore: +; ALL: # %bb.0: # %entry +; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp) +; ALL-NEXT: retq +entry: + %f = alloca float, align 4 + %d = alloca double, align 8 + %tmp = load double, double* %d, align 8 + %conv = fptrunc double %tmp to float + store float %conv, float* %f, align 4 + ret void +} + +define double @long_to_double(i64 %x) { +; ALL-LABEL: long_to_double: +; ALL: # %bb.0: +; ALL-NEXT: vmovq %rdi, %xmm0 +; ALL-NEXT: retq + %res = bitcast i64 %x to double + ret double %res +} + +define i64 @double_to_long(double %x) { +; ALL-LABEL: double_to_long: +; ALL: # %bb.0: +; ALL-NEXT: vmovq %xmm0, %rax +; ALL-NEXT: retq + %res = bitcast double %x to i64 + ret i64 %res +} + +define float @int_to_float(i32 %x) { +; ALL-LABEL: int_to_float: +; ALL: # %bb.0: +; ALL-NEXT: vmovd %edi, %xmm0 +; ALL-NEXT: retq + %res = bitcast i32 %x to float + ret float %res +} + +define i32 @float_to_int(float %x) { +; ALL-LABEL: float_to_int: +; ALL: # %bb.0: +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: retq + %res = bitcast float %x to i32 + ret i32 %res +} + +define <16 x double> @uito16f64(<16 x i32> %a) nounwind { +; ALL-LABEL: uito16f64: +; ALL: # %bb.0: +; ALL-NEXT: vcvtudq2pd %ymm0, %zmm2 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vcvtudq2pd %ymm0, %zmm1 +; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: retq + %b = uitofp <16 x i32> %a to <16 x double> + ret <16 x double> %b +} + +define <8 x float> @slto8f32(<8 x i64> %a) { +; NODQ-LABEL: slto8f32: +; NODQ: # %bb.0: +; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: slto8f32: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtqq2ps %zmm0, %ymm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: slto8f32: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0 +; DQNOVL-NEXT: retq + %b = sitofp <8 x i64> %a to <8 x float> + ret <8 x float> %b +} + +define <16 x float> @slto16f32(<16 x i64> %a) { +; NODQ-LABEL: slto16f32: +; NODQ: # %bb.0: +; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] +; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm3 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm1 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] +; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] +; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm3 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] +; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: slto16f32: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtqq2ps %zmm0, %ymm0 +; VLDQ-NEXT: vcvtqq2ps %zmm1, %ymm1 +; VLDQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: slto16f32: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0 +; DQNOVL-NEXT: vcvtqq2ps %zmm1, %ymm1 +; DQNOVL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; DQNOVL-NEXT: retq + %b = sitofp <16 x i64> %a to <16 x float> + ret <16 x float> %b +} + +define <8 x double> @slto8f64(<8 x i64> %a) { +; NODQ-LABEL: slto8f64: +; NODQ: # %bb.0: +; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: slto8f64: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtqq2pd %zmm0, %zmm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: slto8f64: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvtqq2pd %zmm0, %zmm0 +; DQNOVL-NEXT: retq + %b = sitofp <8 x i64> %a to <8 x double> + ret <8 x double> %b +} + +define <16 x double> @slto16f64(<16 x i64> %a) { +; NODQ-LABEL: slto16f64: +; NODQ: # %bb.0: +; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm4 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm3 +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm0 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm2 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm3 +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm3 +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm1 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: slto16f64: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtqq2pd %zmm0, %zmm0 +; VLDQ-NEXT: vcvtqq2pd %zmm1, %zmm1 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: slto16f64: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvtqq2pd %zmm0, %zmm0 +; DQNOVL-NEXT: vcvtqq2pd %zmm1, %zmm1 +; DQNOVL-NEXT: retq + %b = sitofp <16 x i64> %a to <16 x double> + ret <16 x double> %b +} + +define <8 x float> @ulto8f32(<8 x i64> %a) { +; NODQ-LABEL: ulto8f32: +; NODQ: # %bb.0: +; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm1 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: ulto8f32: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: ulto8f32: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; DQNOVL-NEXT: retq + %b = uitofp <8 x i64> %a to <8 x float> + ret <8 x float> %b +} + +define <16 x float> @ulto16f32(<16 x i64> %a) { +; NODQ-LABEL: ulto16f32: +; NODQ: # %bb.0: +; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] +; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm3 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm1 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm1 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] +; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] +; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm3 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] +; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: ulto16f32: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; VLDQ-NEXT: vcvtuqq2ps %zmm1, %ymm1 +; VLDQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: ulto16f32: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; DQNOVL-NEXT: vcvtuqq2ps %zmm1, %ymm1 +; DQNOVL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; DQNOVL-NEXT: retq + %b = uitofp <16 x i64> %a to <16 x float> + ret <16 x float> %b +} + +define <8 x double> @uito8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind { +; KNL-LABEL: uito8f64_mask: +; KNL: # %bb.0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} +; KNL-NEXT: retq +; +; VLBW-LABEL: uito8f64_mask: +; VLBW: # %bb.0: +; VLBW-NEXT: kmovd %edi, %k1 +; VLBW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} +; VLBW-NEXT: retq +; +; VLNOBW-LABEL: uito8f64_mask: +; VLNOBW: # %bb.0: +; VLNOBW-NEXT: kmovw %edi, %k1 +; VLNOBW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} +; VLNOBW-NEXT: retq +; +; DQNOVL-LABEL: uito8f64_mask: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: kmovw %edi, %k1 +; DQNOVL-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} +; DQNOVL-NEXT: retq +; +; AVX512BW-LABEL: uito8f64_mask: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} +; AVX512BW-NEXT: retq + %1 = bitcast i8 %c to <8 x i1> + %2 = uitofp <8 x i32> %b to <8 x double> + %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a + ret <8 x double> %3 +} +define <8 x double> @uito8f64_maskz(<8 x i32> %a, i8 %b) nounwind { +; KNL-LABEL: uito8f64_maskz: +; KNL: # %bb.0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} +; KNL-NEXT: retq +; +; VLBW-LABEL: uito8f64_maskz: +; VLBW: # %bb.0: +; VLBW-NEXT: kmovd %edi, %k1 +; VLBW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} +; VLBW-NEXT: retq +; +; VLNOBW-LABEL: uito8f64_maskz: +; VLNOBW: # %bb.0: +; VLNOBW-NEXT: kmovw %edi, %k1 +; VLNOBW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} +; VLNOBW-NEXT: retq +; +; DQNOVL-LABEL: uito8f64_maskz: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: kmovw %edi, %k1 +; DQNOVL-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} +; DQNOVL-NEXT: retq +; +; AVX512BW-LABEL: uito8f64_maskz: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq + %1 = bitcast i8 %b to <8 x i1> + %2 = uitofp <8 x i32> %a to <8 x double> + %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer + ret <8 x double> %3 +} + +define <4 x double> @uito4f64(<4 x i32> %a) nounwind { +; NOVL-LABEL: uito4f64: +; NOVL: # %bb.0: +; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; NOVL-NEXT: vcvtudq2pd %ymm0, %zmm0 +; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; NOVL-NEXT: retq +; +; VL-LABEL: uito4f64: +; VL: # %bb.0: +; VL-NEXT: vcvtudq2pd %xmm0, %ymm0 +; VL-NEXT: retq + %b = uitofp <4 x i32> %a to <4 x double> + ret <4 x double> %b +} + +define <16 x float> @uito16f32(<16 x i32> %a) nounwind { +; ALL-LABEL: uito16f32: +; ALL: # %bb.0: +; ALL-NEXT: vcvtudq2ps %zmm0, %zmm0 +; ALL-NEXT: retq + %b = uitofp <16 x i32> %a to <16 x float> + ret <16 x float> %b +} + +define <8 x double> @uito8f64(<8 x i32> %a) { +; ALL-LABEL: uito8f64: +; ALL: # %bb.0: +; ALL-NEXT: vcvtudq2pd %ymm0, %zmm0 +; ALL-NEXT: retq + %b = uitofp <8 x i32> %a to <8 x double> + ret <8 x double> %b +} + +define <8 x float> @uito8f32(<8 x i32> %a) nounwind { +; NOVL-LABEL: uito8f32: +; NOVL: # %bb.0: +; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0 +; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; NOVL-NEXT: retq +; +; VL-LABEL: uito8f32: +; VL: # %bb.0: +; VL-NEXT: vcvtudq2ps %ymm0, %ymm0 +; VL-NEXT: retq + %b = uitofp <8 x i32> %a to <8 x float> + ret <8 x float> %b +} + +define <4 x float> @uito4f32(<4 x i32> %a) nounwind { +; NOVL-LABEL: uito4f32: +; NOVL: # %bb.0: +; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0 +; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; NOVL-NEXT: vzeroupper +; NOVL-NEXT: retq +; +; VL-LABEL: uito4f32: +; VL: # %bb.0: +; VL-NEXT: vcvtudq2ps %xmm0, %xmm0 +; VL-NEXT: retq + %b = uitofp <4 x i32> %a to <4 x float> + ret <4 x float> %b +} + +define i32 @fptosi(float %a) nounwind { +; ALL-LABEL: fptosi: +; ALL: # %bb.0: +; ALL-NEXT: vcvttss2si %xmm0, %eax +; ALL-NEXT: retq + %b = fptosi float %a to i32 + ret i32 %b +} + +define i32 @fptoui(float %a) nounwind { +; ALL-LABEL: fptoui: +; ALL: # %bb.0: +; ALL-NEXT: vcvttss2usi %xmm0, %eax +; ALL-NEXT: retq + %b = fptoui float %a to i32 + ret i32 %b +} + +define float @uitof32(i32 %a) nounwind { +; ALL-LABEL: uitof32: +; ALL: # %bb.0: +; ALL-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0 +; ALL-NEXT: retq + %b = uitofp i32 %a to float + ret float %b +} + +define double @uitof64(i32 %a) nounwind { +; ALL-LABEL: uitof64: +; ALL: # %bb.0: +; ALL-NEXT: vcvtusi2sd %edi, %xmm0, %xmm0 +; ALL-NEXT: retq + %b = uitofp i32 %a to double + ret double %b +} + +define <16 x float> @sbto16f32(<16 x i32> %a) { +; NODQ-LABEL: sbto16f32: +; NODQ: # %bb.0: +; NODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; NODQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NODQ-NEXT: vcvtdq2ps %zmm0, %zmm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: sbto16f32: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vpmovd2m %zmm0, %k0 +; VLDQ-NEXT: vpmovm2d %k0, %zmm0 +; VLDQ-NEXT: vcvtdq2ps %zmm0, %zmm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: sbto16f32: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vpmovd2m %zmm0, %k0 +; DQNOVL-NEXT: vpmovm2d %k0, %zmm0 +; DQNOVL-NEXT: vcvtdq2ps %zmm0, %zmm0 +; DQNOVL-NEXT: retq + %mask = icmp slt <16 x i32> %a, zeroinitializer + %1 = sitofp <16 x i1> %mask to <16 x float> + ret <16 x float> %1 +} + +define <16 x float> @scto16f32(<16 x i8> %a) { +; ALL-LABEL: scto16f32: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbd %xmm0, %zmm0 +; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 +; ALL-NEXT: retq + %1 = sitofp <16 x i8> %a to <16 x float> + ret <16 x float> %1 +} + +define <16 x float> @ssto16f32(<16 x i16> %a) { +; ALL-LABEL: ssto16f32: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxwd %ymm0, %zmm0 +; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 +; ALL-NEXT: retq + %1 = sitofp <16 x i16> %a to <16 x float> + ret <16 x float> %1 +} + +define <8 x double> @ssto16f64(<8 x i16> %a) { +; ALL-LABEL: ssto16f64: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxwd %xmm0, %ymm0 +; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 +; ALL-NEXT: retq + %1 = sitofp <8 x i16> %a to <8 x double> + ret <8 x double> %1 +} + +define <8 x double> @scto8f64(<8 x i8> %a) { +; ALL-LABEL: scto8f64: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbd %xmm0, %ymm0 +; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 +; ALL-NEXT: retq + %1 = sitofp <8 x i8> %a to <8 x double> + ret <8 x double> %1 +} + +define <16 x double> @scto16f64(<16 x i8> %a) { +; ALL-LABEL: scto16f64: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbd %xmm0, %zmm1 +; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1 +; ALL-NEXT: retq + %b = sitofp <16 x i8> %a to <16 x double> + ret <16 x double> %b +} + +define <16 x double> @sbto16f64(<16 x double> %a) { +; NODQ-LABEL: sbto16f64: +; NODQ: # %bb.0: +; NODQ-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; NODQ-NEXT: vcmpltpd %zmm0, %zmm2, %k0 +; NODQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1 +; NODQ-NEXT: kunpckbw %k0, %k1, %k1 +; NODQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NODQ-NEXT: vcvtdq2pd %ymm1, %zmm0 +; NODQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; NODQ-NEXT: vcvtdq2pd %ymm1, %zmm1 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: sbto16f64: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; VLDQ-NEXT: vcmpltpd %zmm0, %zmm2, %k0 +; VLDQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1 +; VLDQ-NEXT: kunpckbw %k0, %k1, %k0 +; VLDQ-NEXT: vpmovm2d %k0, %zmm1 +; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm0 +; VLDQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: sbto16f64: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; DQNOVL-NEXT: vcmpltpd %zmm0, %zmm2, %k0 +; DQNOVL-NEXT: vcmpltpd %zmm1, %zmm2, %k1 +; DQNOVL-NEXT: kunpckbw %k0, %k1, %k0 +; DQNOVL-NEXT: vpmovm2d %k0, %zmm1 +; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm0 +; DQNOVL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm1 +; DQNOVL-NEXT: retq + %cmpres = fcmp ogt <16 x double> %a, zeroinitializer + %1 = sitofp <16 x i1> %cmpres to <16 x double> + ret <16 x double> %1 +} + +define <8 x double> @sbto8f64(<8 x double> %a) { +; NOVLDQ-LABEL: sbto8f64: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1 +; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: sbto8f64: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; VLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0 +; VLDQ-NEXT: vpmovm2d %k0, %ymm0 +; VLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: sbto8f64: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; VLNODQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1 +; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; VLNODQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; VLNODQ-NEXT: vcvtdq2pd %ymm0, %zmm0 +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: sbto8f64: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; DQNOVL-NEXT: vcmpltpd %zmm0, %zmm1, %k0 +; DQNOVL-NEXT: vpmovm2d %k0, %zmm0 +; DQNOVL-NEXT: vcvtdq2pd %ymm0, %zmm0 +; DQNOVL-NEXT: retq + %cmpres = fcmp ogt <8 x double> %a, zeroinitializer + %1 = sitofp <8 x i1> %cmpres to <8 x double> + ret <8 x double> %1 +} + +define <8 x float> @sbto8f32(<8 x float> %a) { +; ALL-LABEL: sbto8f32: +; ALL: # %bb.0: +; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 +; ALL-NEXT: vcvtdq2ps %ymm0, %ymm0 +; ALL-NEXT: retq + %cmpres = fcmp ogt <8 x float> %a, zeroinitializer + %1 = sitofp <8 x i1> %cmpres to <8 x float> + ret <8 x float> %1 +} + +define <4 x float> @sbto4f32(<4 x float> %a) { +; ALL-LABEL: sbto4f32: +; ALL: # %bb.0: +; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 +; ALL-NEXT: vcvtdq2ps %xmm0, %xmm0 +; ALL-NEXT: retq + %cmpres = fcmp ogt <4 x float> %a, zeroinitializer + %1 = sitofp <4 x i1> %cmpres to <4 x float> + ret <4 x float> %1 +} + +define <4 x double> @sbto4f64(<4 x double> %a) { +; NOVL-LABEL: sbto4f64: +; NOVL: # %bb.0: +; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 +; NOVL-NEXT: vpmovqd %zmm0, %ymm0 +; NOVL-NEXT: vcvtdq2pd %xmm0, %ymm0 +; NOVL-NEXT: retq +; +; VLDQ-LABEL: sbto4f64: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; VLDQ-NEXT: vcmpltpd %ymm0, %ymm1, %k0 +; VLDQ-NEXT: vpmovm2d %k0, %xmm0 +; VLDQ-NEXT: vcvtdq2pd %xmm0, %ymm0 +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: sbto4f64: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; VLNODQ-NEXT: vcmpltpd %ymm0, %ymm1, %k1 +; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; VLNODQ-NEXT: vcvtdq2pd %xmm0, %ymm0 +; VLNODQ-NEXT: retq + %cmpres = fcmp ogt <4 x double> %a, zeroinitializer + %1 = sitofp <4 x i1> %cmpres to <4 x double> + ret <4 x double> %1 +} + +define <2 x float> @sbto2f32(<2 x float> %a) { +; ALL-LABEL: sbto2f32: +; ALL: # %bb.0: +; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 +; ALL-NEXT: vcvtdq2ps %xmm0, %xmm0 +; ALL-NEXT: retq + %cmpres = fcmp ogt <2 x float> %a, zeroinitializer + %1 = sitofp <2 x i1> %cmpres to <2 x float> + ret <2 x float> %1 +} + +define <2 x double> @sbto2f64(<2 x double> %a) { +; NOVL-LABEL: sbto2f64: +; NOVL: # %bb.0: +; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 +; NOVL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0 +; NOVL-NEXT: retq +; +; VLDQ-LABEL: sbto2f64: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; VLDQ-NEXT: vcmpltpd %xmm0, %xmm1, %k0 +; VLDQ-NEXT: vpmovm2d %k0, %xmm0 +; VLDQ-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: sbto2f64: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; VLNODQ-NEXT: vcmpltpd %xmm0, %xmm1, %k1 +; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; VLNODQ-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VLNODQ-NEXT: retq + %cmpres = fcmp ogt <2 x double> %a, zeroinitializer + %1 = sitofp <2 x i1> %cmpres to <2 x double> + ret <2 x double> %1 +} + +define <16 x float> @ucto16f32(<16 x i8> %a) { +; ALL-LABEL: ucto16f32: +; ALL: # %bb.0: +; ALL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 +; ALL-NEXT: retq + %b = uitofp <16 x i8> %a to <16 x float> + ret <16 x float>%b +} + +define <8 x double> @ucto8f64(<8 x i8> %a) { +; ALL-LABEL: ucto8f64: +; ALL: # %bb.0: +; ALL-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 +; ALL-NEXT: retq + %b = uitofp <8 x i8> %a to <8 x double> + ret <8 x double> %b +} + +define <16 x float> @swto16f32(<16 x i16> %a) { +; ALL-LABEL: swto16f32: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxwd %ymm0, %zmm0 +; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 +; ALL-NEXT: retq + %b = sitofp <16 x i16> %a to <16 x float> + ret <16 x float> %b +} + +define <8 x double> @swto8f64(<8 x i16> %a) { +; ALL-LABEL: swto8f64: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxwd %xmm0, %ymm0 +; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 +; ALL-NEXT: retq + %b = sitofp <8 x i16> %a to <8 x double> + ret <8 x double> %b +} + +define <16 x double> @swto16f64(<16 x i16> %a) { +; ALL-LABEL: swto16f64: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxwd %ymm0, %zmm1 +; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1 +; ALL-NEXT: retq + %b = sitofp <16 x i16> %a to <16 x double> + ret <16 x double> %b +} + +define <16 x double> @ucto16f64(<16 x i8> %a) { +; ALL-LABEL: ucto16f64: +; ALL: # %bb.0: +; ALL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1 +; ALL-NEXT: retq + %b = uitofp <16 x i8> %a to <16 x double> + ret <16 x double> %b +} + +define <16 x float> @uwto16f32(<16 x i16> %a) { +; ALL-LABEL: uwto16f32: +; ALL: # %bb.0: +; ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 +; ALL-NEXT: retq + %b = uitofp <16 x i16> %a to <16 x float> + ret <16 x float> %b +} + +define <8 x double> @uwto8f64(<8 x i16> %a) { +; ALL-LABEL: uwto8f64: +; ALL: # %bb.0: +; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 +; ALL-NEXT: retq + %b = uitofp <8 x i16> %a to <8 x double> + ret <8 x double> %b +} + +define <16 x double> @uwto16f64(<16 x i16> %a) { +; ALL-LABEL: uwto16f64: +; ALL: # %bb.0: +; ALL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1 +; ALL-NEXT: retq + %b = uitofp <16 x i16> %a to <16 x double> + ret <16 x double> %b +} + +define <16 x float> @sito16f32(<16 x i32> %a) { +; ALL-LABEL: sito16f32: +; ALL: # %bb.0: +; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 +; ALL-NEXT: retq + %b = sitofp <16 x i32> %a to <16 x float> + ret <16 x float> %b +} + +define <16 x double> @sito16f64(<16 x i32> %a) { +; ALL-LABEL: sito16f64: +; ALL: # %bb.0: +; ALL-NEXT: vcvtdq2pd %ymm0, %zmm2 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vcvtdq2pd %ymm0, %zmm1 +; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: retq + %b = sitofp <16 x i32> %a to <16 x double> + ret <16 x double> %b +} + +define <16 x float> @usto16f32(<16 x i16> %a) { +; ALL-LABEL: usto16f32: +; ALL: # %bb.0: +; ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 +; ALL-NEXT: retq + %b = uitofp <16 x i16> %a to <16 x float> + ret <16 x float> %b +} + +define <16 x float> @ubto16f32(<16 x i32> %a) { +; NODQ-LABEL: ubto16f32: +; NODQ: # %bb.0: +; NODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; NODQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NODQ-NEXT: vpsrld $31, %zmm0, %zmm0 +; NODQ-NEXT: vcvtdq2ps %zmm0, %zmm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: ubto16f32: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vpmovd2m %zmm0, %k0 +; VLDQ-NEXT: vpmovm2d %k0, %zmm0 +; VLDQ-NEXT: vpsrld $31, %zmm0, %zmm0 +; VLDQ-NEXT: vcvtdq2ps %zmm0, %zmm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: ubto16f32: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vpmovd2m %zmm0, %k0 +; DQNOVL-NEXT: vpmovm2d %k0, %zmm0 +; DQNOVL-NEXT: vpsrld $31, %zmm0, %zmm0 +; DQNOVL-NEXT: vcvtdq2ps %zmm0, %zmm0 +; DQNOVL-NEXT: retq + %mask = icmp slt <16 x i32> %a, zeroinitializer + %1 = uitofp <16 x i1> %mask to <16 x float> + ret <16 x float> %1 +} + +define <16 x double> @ubto16f64(<16 x i32> %a) { +; NODQ-LABEL: ubto16f64: +; NODQ: # %bb.0: +; NODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; NODQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NODQ-NEXT: vpsrld $31, %zmm0, %zmm1 +; NODQ-NEXT: vcvtdq2pd %ymm1, %zmm0 +; NODQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; NODQ-NEXT: vcvtdq2pd %ymm1, %zmm1 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: ubto16f64: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vpmovd2m %zmm0, %k0 +; VLDQ-NEXT: vpmovm2d %k0, %zmm0 +; VLDQ-NEXT: vpsrld $31, %zmm0, %zmm1 +; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm0 +; VLDQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: ubto16f64: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vpmovd2m %zmm0, %k0 +; DQNOVL-NEXT: vpmovm2d %k0, %zmm0 +; DQNOVL-NEXT: vpsrld $31, %zmm0, %zmm1 +; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm0 +; DQNOVL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm1 +; DQNOVL-NEXT: retq + %mask = icmp slt <16 x i32> %a, zeroinitializer + %1 = uitofp <16 x i1> %mask to <16 x double> + ret <16 x double> %1 +} + +define <8 x float> @ubto8f32(<8 x i32> %a) { +; NOVL-LABEL: ubto8f32: +; NOVL: # %bb.0: +; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 +; NOVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216] +; NOVL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; NOVL-NEXT: retq +; +; VL-LABEL: ubto8f32: +; VL: # %bb.0: +; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VL-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 +; VL-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm0, %ymm0 +; VL-NEXT: retq + %mask = icmp slt <8 x i32> %a, zeroinitializer + %1 = uitofp <8 x i1> %mask to <8 x float> + ret <8 x float> %1 +} + +define <8 x double> @ubto8f64(<8 x i32> %a) { +; ALL-LABEL: ubto8f64: +; ALL: # %bb.0: +; ALL-NEXT: vpsrld $31, %ymm0, %ymm0 +; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 +; ALL-NEXT: retq + %mask = icmp slt <8 x i32> %a, zeroinitializer + %1 = uitofp <8 x i1> %mask to <8 x double> + ret <8 x double> %1 +} + +define <4 x float> @ubto4f32(<4 x i32> %a) { +; NOVL-LABEL: ubto4f32: +; NOVL: # %bb.0: +; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216] +; NOVL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: retq +; +; VL-LABEL: ubto4f32: +; VL: # %bb.0: +; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; VL-NEXT: retq + %mask = icmp slt <4 x i32> %a, zeroinitializer + %1 = uitofp <4 x i1> %mask to <4 x float> + ret <4 x float> %1 +} + +define <4 x double> @ubto4f64(<4 x i32> %a) { +; ALL-LABEL: ubto4f64: +; ALL: # %bb.0: +; ALL-NEXT: vpsrld $31, %xmm0, %xmm0 +; ALL-NEXT: vcvtdq2pd %xmm0, %ymm0 +; ALL-NEXT: retq + %mask = icmp slt <4 x i32> %a, zeroinitializer + %1 = uitofp <4 x i1> %mask to <4 x double> + ret <4 x double> %1 +} + +define <2 x float> @ubto2f32(<2 x i32> %a) { +; NOVL-LABEL: ubto2f32: +; NOVL: # %bb.0: +; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216] +; NOVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: retq +; +; VL-LABEL: ubto2f32: +; VL: # %bb.0: +; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; VL-NEXT: retq + %mask = icmp ne <2 x i32> %a, zeroinitializer + %1 = uitofp <2 x i1> %mask to <2 x float> + ret <2 x float> %1 +} + +define <2 x double> @ubto2f64(<2 x i32> %a) { +; NOVL-LABEL: ubto2f64: +; NOVL: # %bb.0: +; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; NOVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0 +; NOVL-NEXT: retq +; +; VL-LABEL: ubto2f64: +; VL: # %bb.0: +; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; VL-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VL-NEXT: retq + %mask = icmp ne <2 x i32> %a, zeroinitializer + %1 = uitofp <2 x i1> %mask to <2 x double> + ret <2 x double> %1 +} + +define <2 x i64> @test_2f64toub(<2 x double> %a, <2 x i64> %passthru) { +; NOVLDQ-LABEL: test_2f64toub: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; NOVLDQ-NEXT: vcvttpd2udq %zmm0, %ymm0 +; NOVLDQ-NEXT: vpslld $31, %ymm0, %ymm0 +; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; NOVLDQ-NEXT: vzeroupper +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_2f64toub: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttpd2udq %xmm0, %xmm0 +; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0 +; VLDQ-NEXT: vpmovd2m %xmm0, %k1 +; VLDQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_2f64toub: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vcvttpd2udq %xmm0, %xmm0 +; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0 +; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 +; VLNODQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: test_2f64toub: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; DQNOVL-NEXT: vcvttpd2udq %zmm0, %ymm0 +; DQNOVL-NEXT: vpslld $31, %ymm0, %ymm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; DQNOVL-NEXT: vzeroupper +; DQNOVL-NEXT: retq + %mask = fptoui <2 x double> %a to <2 x i1> + %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer + ret <2 x i64> %select +} + +define <4 x i64> @test_4f64toub(<4 x double> %a, <4 x i64> %passthru) { +; NOVLDQ-LABEL: test_4f64toub: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; NOVLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0 +; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0 +; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_4f64toub: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0 +; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0 +; VLDQ-NEXT: vpmovd2m %xmm0, %k1 +; VLDQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_4f64toub: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vcvttpd2dq %ymm0, %xmm0 +; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0 +; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 +; VLNODQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: test_4f64toub: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; DQNOVL-NEXT: vcvttpd2dq %ymm0, %xmm0 +; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; DQNOVL-NEXT: retq + %mask = fptoui <4 x double> %a to <4 x i1> + %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer + ret <4 x i64> %select +} + +define <8 x i64> @test_8f64toub(<8 x double> %a, <8 x i64> %passthru) { +; NOVLDQ-LABEL: test_8f64toub: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; NOVLDQ-NEXT: vpslld $31, %ymm0, %ymm0 +; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_8f64toub: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; VLDQ-NEXT: vpslld $31, %ymm0, %ymm0 +; VLDQ-NEXT: vpmovd2m %ymm0, %k1 +; VLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_8f64toub: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; VLNODQ-NEXT: vpslld $31, %ymm0, %ymm0 +; VLNODQ-NEXT: vptestmd %ymm0, %ymm0, %k1 +; VLNODQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: test_8f64toub: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; DQNOVL-NEXT: vpslld $31, %ymm0, %ymm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: retq + %mask = fptoui <8 x double> %a to <8 x i1> + %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer + ret <8 x i64> %select +} + +define <2 x i64> @test_2f32toub(<2 x float> %a, <2 x i64> %passthru) { +; NOVLDQ-LABEL: test_2f32toub: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0 +; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; NOVLDQ-NEXT: vzeroupper +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_2f32toub: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0 +; VLDQ-NEXT: vpmovd2m %xmm0, %k1 +; VLDQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_2f32toub: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0 +; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 +; VLNODQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: test_2f32toub: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; DQNOVL-NEXT: vcvttps2dq %xmm0, %xmm0 +; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; DQNOVL-NEXT: vzeroupper +; DQNOVL-NEXT: retq + %mask = fptoui <2 x float> %a to <2 x i1> + %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer + ret <2 x i64> %select +} + +define <4 x i64> @test_4f32toub(<4 x float> %a, <4 x i64> %passthru) { +; NOVLDQ-LABEL: test_4f32toub: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0 +; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_4f32toub: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0 +; VLDQ-NEXT: vpmovd2m %xmm0, %k1 +; VLDQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_4f32toub: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0 +; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 +; VLNODQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: test_4f32toub: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; DQNOVL-NEXT: vcvttps2dq %xmm0, %xmm0 +; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; DQNOVL-NEXT: retq + %mask = fptoui <4 x float> %a to <4 x i1> + %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer + ret <4 x i64> %select +} + +define <8 x i64> @test_8f32toub(<8 x float> %a, <8 x i64> %passthru) { +; NOVLDQ-LABEL: test_8f32toub: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; NOVLDQ-NEXT: vpslld $31, %ymm0, %ymm0 +; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_8f32toub: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; VLDQ-NEXT: vpslld $31, %ymm0, %ymm0 +; VLDQ-NEXT: vpmovd2m %ymm0, %k1 +; VLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_8f32toub: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; VLNODQ-NEXT: vpslld $31, %ymm0, %ymm0 +; VLNODQ-NEXT: vptestmd %ymm0, %ymm0, %k1 +; VLNODQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: test_8f32toub: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvttps2dq %ymm0, %ymm0 +; DQNOVL-NEXT: vpslld $31, %ymm0, %ymm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: retq + %mask = fptoui <8 x float> %a to <8 x i1> + %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer + ret <8 x i64> %select +} + +define <16 x i32> @test_16f32toub(<16 x float> %a, <16 x i32> %passthru) { +; NODQ-LABEL: test_16f32toub: +; NODQ: # %bb.0: +; NODQ-NEXT: vcvttps2dq %zmm0, %zmm0 +; NODQ-NEXT: vpslld $31, %zmm0, %zmm0 +; NODQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NODQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} +; NODQ-NEXT: retq +; +; VLDQ-LABEL: test_16f32toub: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttps2dq %zmm0, %zmm0 +; VLDQ-NEXT: vpslld $31, %zmm0, %zmm0 +; VLDQ-NEXT: vpmovd2m %zmm0, %k1 +; VLDQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: test_16f32toub: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvttps2dq %zmm0, %zmm0 +; DQNOVL-NEXT: vpslld $31, %zmm0, %zmm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: retq + %mask = fptoui <16 x float> %a to <16 x i1> + %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer + ret <16 x i32> %select +} + +define <2 x i64> @test_2f64tosb(<2 x double> %a, <2 x i64> %passthru) { +; NOVLDQ-LABEL: test_2f64tosb: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; NOVLDQ-NEXT: vcvttpd2dq %xmm0, %xmm0 +; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0 +; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; NOVLDQ-NEXT: vzeroupper +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_2f64tosb: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttpd2dq %xmm0, %xmm0 +; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0 +; VLDQ-NEXT: vpmovd2m %xmm0, %k1 +; VLDQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_2f64tosb: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vcvttpd2dq %xmm0, %xmm0 +; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0 +; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 +; VLNODQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: test_2f64tosb: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; DQNOVL-NEXT: vcvttpd2dq %xmm0, %xmm0 +; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; DQNOVL-NEXT: vzeroupper +; DQNOVL-NEXT: retq + %mask = fptosi <2 x double> %a to <2 x i1> + %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer + ret <2 x i64> %select +} + +define <4 x i64> @test_4f64tosb(<4 x double> %a, <4 x i64> %passthru) { +; NOVLDQ-LABEL: test_4f64tosb: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; NOVLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0 +; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_4f64tosb: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0 +; VLDQ-NEXT: vpmovd2m %xmm0, %k1 +; VLDQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_4f64tosb: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vcvttpd2dq %ymm0, %xmm0 +; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 +; VLNODQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: test_4f64tosb: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; DQNOVL-NEXT: vcvttpd2dq %ymm0, %xmm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; DQNOVL-NEXT: retq + %mask = fptosi <4 x double> %a to <4 x i1> + %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer + ret <4 x i64> %select +} + +define <8 x i64> @test_8f64tosb(<8 x double> %a, <8 x i64> %passthru) { +; NOVLDQ-LABEL: test_8f64tosb: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_8f64tosb: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; VLDQ-NEXT: vpmovd2m %ymm0, %k1 +; VLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_8f64tosb: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; VLNODQ-NEXT: vptestmd %ymm0, %ymm0, %k1 +; VLNODQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: test_8f64tosb: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: retq + %mask = fptosi <8 x double> %a to <8 x i1> + %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer + ret <8 x i64> %select +} + +define <2 x i64> @test_2f32tosb(<2 x float> %a, <2 x i64> %passthru) { +; NOVLDQ-LABEL: test_2f32tosb: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; NOVLDQ-NEXT: vzeroupper +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_2f32tosb: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; VLDQ-NEXT: vpmovd2m %xmm0, %k1 +; VLDQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_2f32tosb: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 +; VLNODQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: test_2f32tosb: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; DQNOVL-NEXT: vcvttps2dq %xmm0, %xmm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; DQNOVL-NEXT: vzeroupper +; DQNOVL-NEXT: retq + %mask = fptosi <2 x float> %a to <2 x i1> + %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer + ret <2 x i64> %select +} + +define <4 x i64> @test_4f32tosb(<4 x float> %a, <4 x i64> %passthru) { +; NOVLDQ-LABEL: test_4f32tosb: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_4f32tosb: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; VLDQ-NEXT: vpmovd2m %xmm0, %k1 +; VLDQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_4f32tosb: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 +; VLNODQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: test_4f32tosb: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; DQNOVL-NEXT: vcvttps2dq %xmm0, %xmm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; DQNOVL-NEXT: retq + %mask = fptosi <4 x float> %a to <4 x i1> + %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer + ret <4 x i64> %select +} + +define <8 x i64> @test_8f32tosb(<8 x float> %a, <8 x i64> %passthru) { +; NOVLDQ-LABEL: test_8f32tosb: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_8f32tosb: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; VLDQ-NEXT: vpmovd2m %ymm0, %k1 +; VLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_8f32tosb: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; VLNODQ-NEXT: vptestmd %ymm0, %ymm0, %k1 +; VLNODQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: test_8f32tosb: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvttps2dq %ymm0, %ymm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: retq + %mask = fptosi <8 x float> %a to <8 x i1> + %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer + ret <8 x i64> %select +} + +define <16 x i32> @test_16f32tosb(<16 x float> %a, <16 x i32> %passthru) { +; NODQ-LABEL: test_16f32tosb: +; NODQ: # %bb.0: +; NODQ-NEXT: vcvttps2dq %zmm0, %zmm0 +; NODQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NODQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} +; NODQ-NEXT: retq +; +; VLDQ-LABEL: test_16f32tosb: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttps2dq %zmm0, %zmm0 +; VLDQ-NEXT: vpmovd2m %zmm0, %k1 +; VLDQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: test_16f32tosb: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvttps2dq %zmm0, %zmm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: retq + %mask = fptosi <16 x float> %a to <16 x i1> + %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer + ret <16 x i32> %select +} + +define <2 x double> @test_sito2f64_mask_load(<2 x i32> *%a, <2 x i64> %c) { +; SSE-LABEL: sitofp_load_2i32_to_2f64: +; SSE: # %bb.0: +; SSE-NEXT: cvtdq2pd (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_load_2i32_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2pd (%rdi), %xmm0 +; AVX-NEXT: retq +; NOVLDQ-LABEL: test_sito2f64_mask_load: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NOVLDQ-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 +; NOVLDQ-NEXT: vcvtdq2pd (%rdi), %xmm0 +; NOVLDQ-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; NOVLDQ-NEXT: vzeroupper +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_sito2f64_mask_load: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vpmovq2m %xmm0, %k1 +; VLDQ-NEXT: vcvtdq2pd (%rdi), %xmm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_sito2f64_mask_load: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VLNODQ-NEXT: vpcmpgtq %xmm0, %xmm1, %k1 +; VLNODQ-NEXT: vcvtdq2pd (%rdi), %xmm0 {%k1} {z} +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: test_sito2f64_mask_load: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; DQNOVL-NEXT: vpmovq2m %zmm0, %k1 +; DQNOVL-NEXT: vcvtdq2pd (%rdi), %xmm0 +; DQNOVL-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; DQNOVL-NEXT: vzeroupper +; DQNOVL-NEXT: retq + %mask = icmp slt <2 x i64> %c, zeroinitializer + %ld = load <2 x i32>, <2 x i32> *%a + %cvt = sitofp <2 x i32> %ld to <2 x double> + %sel = select <2 x i1> %mask, <2 x double> %cvt, <2 x double> zeroinitializer + ret <2 x double> %sel +} + +define <2 x double> @test_uito2f64_mask_load(<2 x i32> *%a, <2 x i64> %c) { +; SSE-LABEL: sitofp_load_2i32_to_2f64: +; SSE: # %bb.0: +; SSE-NEXT: cvtdq2pd (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_load_2i32_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2pd (%rdi), %xmm0 +; AVX-NEXT: retq +; NOVLDQ-LABEL: test_uito2f64_mask_load: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NOVLDQ-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 +; NOVLDQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; NOVLDQ-NEXT: vcvtudq2pd %ymm0, %zmm0 +; NOVLDQ-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; NOVLDQ-NEXT: vzeroupper +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_uito2f64_mask_load: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vpmovq2m %xmm0, %k1 +; VLDQ-NEXT: vcvtudq2pd (%rdi), %xmm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_uito2f64_mask_load: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VLNODQ-NEXT: vpcmpgtq %xmm0, %xmm1, %k1 +; VLNODQ-NEXT: vcvtudq2pd (%rdi), %xmm0 {%k1} {z} +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: test_uito2f64_mask_load: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; DQNOVL-NEXT: vpmovq2m %zmm0, %k1 +; DQNOVL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; DQNOVL-NEXT: vcvtudq2pd %ymm0, %zmm0 +; DQNOVL-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; DQNOVL-NEXT: vzeroupper +; DQNOVL-NEXT: retq + %mask = icmp slt <2 x i64> %c, zeroinitializer + %ld = load <2 x i32>, <2 x i32> *%a + %cvt = uitofp <2 x i32> %ld to <2 x double> + %sel = select <2 x i1> %mask, <2 x double> %cvt, <2 x double> zeroinitializer + ret <2 x double> %sel +} diff --git a/llvm/test/CodeGen/X86/avx512-trunc-widen.ll b/llvm/test/CodeGen/X86/avx512-trunc-widen.ll new file mode 100644 index 00000000000..1ce08c01773 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512-trunc-widen.ll @@ -0,0 +1,1035 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=KNL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,SKX + + attributes #0 = { nounwind } + +define <16 x i8> @trunc_16x32_to_16x8(<16 x i32> %i) #0 { +; ALL-LABEL: trunc_16x32_to_16x8: +; ALL: ## %bb.0: +; ALL-NEXT: vpmovdb %zmm0, %xmm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %x = trunc <16 x i32> %i to <16 x i8> + ret <16 x i8> %x +} + +define <8 x i16> @trunc_8x64_to_8x16(<8 x i64> %i) #0 { +; ALL-LABEL: trunc_8x64_to_8x16: +; ALL: ## %bb.0: +; ALL-NEXT: vpmovqw %zmm0, %xmm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %x = trunc <8 x i64> %i to <8 x i16> + ret <8 x i16> %x +} + +define <16 x i16> @trunc_v16i32_to_v16i16(<16 x i32> %x) #0 { +; ALL-LABEL: trunc_v16i32_to_v16i16: +; ALL: ## %bb.0: +; ALL-NEXT: vpmovdw %zmm0, %ymm0 +; ALL-NEXT: retq + %1 = trunc <16 x i32> %x to <16 x i16> + ret <16 x i16> %1 +} + +define <8 x i8> @trunc_qb_512(<8 x i64> %i) #0 { +; ALL-LABEL: trunc_qb_512: +; ALL: ## %bb.0: +; ALL-NEXT: vpmovqb %zmm0, %xmm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %x = trunc <8 x i64> %i to <8 x i8> + ret <8 x i8> %x +} + +define void @trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) #0 { +; ALL-LABEL: trunc_qb_512_mem: +; ALL: ## %bb.0: +; ALL-NEXT: vpmovqb %zmm0, (%rdi) +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %x = trunc <8 x i64> %i to <8 x i8> + store <8 x i8> %x, <8 x i8>* %res + ret void +} + +define <4 x i8> @trunc_qb_256(<4 x i64> %i) #0 { +; KNL-LABEL: trunc_qb_256: +; KNL: ## %bb.0: +; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 +; KNL-NEXT: vpmovqb %zmm0, %xmm0 +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_qb_256: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovqb %ymm0, %xmm0 +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %x = trunc <4 x i64> %i to <4 x i8> + ret <4 x i8> %x +} + +define void @trunc_qb_256_mem(<4 x i64> %i, <4 x i8>* %res) #0 { +; KNL-LABEL: trunc_qb_256_mem: +; KNL: ## %bb.0: +; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 +; KNL-NEXT: vpmovqb %zmm0, %xmm0 +; KNL-NEXT: vmovd %xmm0, (%rdi) +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_qb_256_mem: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovqb %ymm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %x = trunc <4 x i64> %i to <4 x i8> + store <4 x i8> %x, <4 x i8>* %res + ret void +} + +define <2 x i8> @trunc_qb_128(<2 x i64> %i) #0 { +; ALL-LABEL: trunc_qb_128: +; ALL: ## %bb.0: +; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %x = trunc <2 x i64> %i to <2 x i8> + ret <2 x i8> %x +} + +define void @trunc_qb_128_mem(<2 x i64> %i, <2 x i8>* %res) #0 { +; KNL-LABEL: trunc_qb_128_mem: +; KNL: ## %bb.0: +; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; KNL-NEXT: vpextrw $0, %xmm0, (%rdi) +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_qb_128_mem: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovqb %xmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <2 x i64> %i to <2 x i8> + store <2 x i8> %x, <2 x i8>* %res + ret void +} + +define <8 x i16> @trunc_qw_512(<8 x i64> %i) #0 { +; ALL-LABEL: trunc_qw_512: +; ALL: ## %bb.0: +; ALL-NEXT: vpmovqw %zmm0, %xmm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %x = trunc <8 x i64> %i to <8 x i16> + ret <8 x i16> %x +} + +define void @trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) #0 { +; ALL-LABEL: trunc_qw_512_mem: +; ALL: ## %bb.0: +; ALL-NEXT: vpmovqw %zmm0, (%rdi) +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %x = trunc <8 x i64> %i to <8 x i16> + store <8 x i16> %x, <8 x i16>* %res + ret void +} + +define <4 x i16> @trunc_qw_256(<4 x i64> %i) #0 { +; KNL-LABEL: trunc_qw_256: +; KNL: ## %bb.0: +; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 +; KNL-NEXT: vpmovqw %zmm0, %xmm0 +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_qw_256: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovqw %ymm0, %xmm0 +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %x = trunc <4 x i64> %i to <4 x i16> + ret <4 x i16> %x +} + +define void @trunc_qw_256_mem(<4 x i64> %i, <4 x i16>* %res) #0 { +; KNL-LABEL: trunc_qw_256_mem: +; KNL: ## %bb.0: +; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 +; KNL-NEXT: vpmovqw %zmm0, %xmm0 +; KNL-NEXT: vmovq %xmm0, (%rdi) +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_qw_256_mem: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovqw %ymm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %x = trunc <4 x i64> %i to <4 x i16> + store <4 x i16> %x, <4 x i16>* %res + ret void +} + +define <2 x i16> @trunc_qw_128(<2 x i64> %i) #0 { +; KNL-LABEL: trunc_qw_128: +; KNL: ## %bb.0: +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_qw_128: +; SKX: ## %bb.0: +; SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; SKX-NEXT: retq + %x = trunc <2 x i64> %i to <2 x i16> + ret <2 x i16> %x +} + +define void @trunc_qw_128_mem(<2 x i64> %i, <2 x i16>* %res) #0 { +; KNL-LABEL: trunc_qw_128_mem: +; KNL: ## %bb.0: +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; KNL-NEXT: vmovd %xmm0, (%rdi) +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_qw_128_mem: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovqw %xmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <2 x i64> %i to <2 x i16> + store <2 x i16> %x, <2 x i16>* %res + ret void +} + +define <8 x i32> @trunc_qd_512(<8 x i64> %i) #0 { +; ALL-LABEL: trunc_qd_512: +; ALL: ## %bb.0: +; ALL-NEXT: vpmovqd %zmm0, %ymm0 +; ALL-NEXT: retq + %x = trunc <8 x i64> %i to <8 x i32> + ret <8 x i32> %x +} + +define void @trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) #0 { +; ALL-LABEL: trunc_qd_512_mem: +; ALL: ## %bb.0: +; ALL-NEXT: vpmovqd %zmm0, (%rdi) +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %x = trunc <8 x i64> %i to <8 x i32> + store <8 x i32> %x, <8 x i32>* %res + ret void +} + +define <4 x i32> @trunc_qd_256(<4 x i64> %i) #0 { +; KNL-LABEL: trunc_qd_256: +; KNL: ## %bb.0: +; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 +; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_qd_256: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovqd %ymm0, %xmm0 +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %x = trunc <4 x i64> %i to <4 x i32> + ret <4 x i32> %x +} + +define void @trunc_qd_256_mem(<4 x i64> %i, <4 x i32>* %res) #0 { +; KNL-LABEL: trunc_qd_256_mem: +; KNL: ## %bb.0: +; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 +; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: vmovdqa %xmm0, (%rdi) +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_qd_256_mem: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovqd %ymm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %x = trunc <4 x i64> %i to <4 x i32> + store <4 x i32> %x, <4 x i32>* %res + ret void +} + +define <2 x i32> @trunc_qd_128(<2 x i64> %i) #0 { +; ALL-LABEL: trunc_qd_128: +; ALL: ## %bb.0: +; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; ALL-NEXT: retq + %x = trunc <2 x i64> %i to <2 x i32> + ret <2 x i32> %x +} + +define void @trunc_qd_128_mem(<2 x i64> %i, <2 x i32>* %res) #0 { +; KNL-LABEL: trunc_qd_128_mem: +; KNL: ## %bb.0: +; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL-NEXT: vmovlps %xmm0, (%rdi) +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_qd_128_mem: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovqd %xmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <2 x i64> %i to <2 x i32> + store <2 x i32> %x, <2 x i32>* %res + ret void +} + +define <16 x i8> @trunc_db_512(<16 x i32> %i) #0 { +; ALL-LABEL: trunc_db_512: +; ALL: ## %bb.0: +; ALL-NEXT: vpmovdb %zmm0, %xmm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %x = trunc <16 x i32> %i to <16 x i8> + ret <16 x i8> %x +} + +define void @trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) #0 { +; ALL-LABEL: trunc_db_512_mem: +; ALL: ## %bb.0: +; ALL-NEXT: vpmovdb %zmm0, (%rdi) +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %x = trunc <16 x i32> %i to <16 x i8> + store <16 x i8> %x, <16 x i8>* %res + ret void +} + +define <8 x i8> @trunc_db_256(<8 x i32> %i) #0 { +; KNL-LABEL: trunc_db_256: +; KNL: ## %bb.0: +; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 +; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_db_256: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovdb %ymm0, %xmm0 +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %x = trunc <8 x i32> %i to <8 x i8> + ret <8 x i8> %x +} + +define void @trunc_db_256_mem(<8 x i32> %i, <8 x i8>* %res) #0 { +; KNL-LABEL: trunc_db_256_mem: +; KNL: ## %bb.0: +; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 +; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vmovq %xmm0, (%rdi) +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_db_256_mem: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovdb %ymm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %x = trunc <8 x i32> %i to <8 x i8> + store <8 x i8> %x, <8 x i8>* %res + ret void +} + +define <4 x i8> @trunc_db_128(<4 x i32> %i) #0 { +; ALL-LABEL: trunc_db_128: +; ALL: ## %bb.0: +; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %x = trunc <4 x i32> %i to <4 x i8> + ret <4 x i8> %x +} + +define void @trunc_db_128_mem(<4 x i32> %i, <4 x i8>* %res) #0 { +; KNL-LABEL: trunc_db_128_mem: +; KNL: ## %bb.0: +; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; KNL-NEXT: vmovd %xmm0, (%rdi) +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_db_128_mem: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovdb %xmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <4 x i32> %i to <4 x i8> + store <4 x i8> %x, <4 x i8>* %res + ret void +} + +define <16 x i16> @trunc_dw_512(<16 x i32> %i) #0 { +; ALL-LABEL: trunc_dw_512: +; ALL: ## %bb.0: +; ALL-NEXT: vpmovdw %zmm0, %ymm0 +; ALL-NEXT: retq + %x = trunc <16 x i32> %i to <16 x i16> + ret <16 x i16> %x +} + +define void @trunc_dw_512_mem(<16 x i32> %i, <16 x i16>* %res) #0 { +; ALL-LABEL: trunc_dw_512_mem: +; ALL: ## %bb.0: +; ALL-NEXT: vpmovdw %zmm0, (%rdi) +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %x = trunc <16 x i32> %i to <16 x i16> + store <16 x i16> %x, <16 x i16>* %res + ret void +} + +define <8 x i16> @trunc_dw_256(<8 x i32> %i) #0 { +; KNL-LABEL: trunc_dw_256: +; KNL: ## %bb.0: +; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 +; KNL-NEXT: vpmovdw %zmm0, %ymm0 +; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_dw_256: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovdw %ymm0, %xmm0 +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %x = trunc <8 x i32> %i to <8 x i16> + ret <8 x i16> %x +} + +define void @trunc_dw_256_mem(<8 x i32> %i, <8 x i16>* %res) #0 { +; KNL-LABEL: trunc_dw_256_mem: +; KNL: ## %bb.0: +; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 +; KNL-NEXT: vpmovdw %zmm0, %ymm0 +; KNL-NEXT: vmovdqa %xmm0, (%rdi) +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_dw_256_mem: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovdw %ymm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %x = trunc <8 x i32> %i to <8 x i16> + store <8 x i16> %x, <8 x i16>* %res + ret void +} + +define void @trunc_dw_128_mem(<4 x i32> %i, <4 x i16>* %res) #0 { +; KNL-LABEL: trunc_dw_128_mem: +; KNL: ## %bb.0: +; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; KNL-NEXT: vmovq %xmm0, (%rdi) +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_dw_128_mem: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovdw %xmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <4 x i32> %i to <4 x i16> + store <4 x i16> %x, <4 x i16>* %res + ret void +} + +define <32 x i8> @trunc_wb_512(<32 x i16> %i) #0 { +; KNL-LABEL: trunc_wb_512: +; KNL: ## %bb.0: +; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; KNL-NEXT: vpmovdb %zmm1, %xmm1 +; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_wb_512: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovwb %zmm0, %ymm0 +; SKX-NEXT: retq + %x = trunc <32 x i16> %i to <32 x i8> + ret <32 x i8> %x +} + +define void @trunc_wb_512_mem(<32 x i16> %i, <32 x i8>* %res) #0 { +; KNL-LABEL: trunc_wb_512_mem: +; KNL: ## %bb.0: +; KNL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; KNL-NEXT: vpmovdb %zmm1, 16(%rdi) +; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; KNL-NEXT: vpmovdb %zmm0, (%rdi) +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_wb_512_mem: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovwb %zmm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %x = trunc <32 x i16> %i to <32 x i8> + store <32 x i8> %x, <32 x i8>* %res + ret void +} + +define <16 x i8> @trunc_wb_256(<16 x i16> %i) #0 { +; KNL-LABEL: trunc_wb_256: +; KNL: ## %bb.0: +; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_wb_256: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovwb %ymm0, %xmm0 +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %x = trunc <16 x i16> %i to <16 x i8> + ret <16 x i8> %x +} + +define void @trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) #0 { +; KNL-LABEL: trunc_wb_256_mem: +; KNL: ## %bb.0: +; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; KNL-NEXT: vpmovdb %zmm0, (%rdi) +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_wb_256_mem: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovwb %ymm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %x = trunc <16 x i16> %i to <16 x i8> + store <16 x i8> %x, <16 x i8>* %res + ret void +} + +define <8 x i8> @trunc_wb_128(<8 x i16> %i) #0 { +; ALL-LABEL: trunc_wb_128: +; ALL: ## %bb.0: +; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %x = trunc <8 x i16> %i to <8 x i8> + ret <8 x i8> %x +} + +define void @trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) #0 { +; KNL-LABEL: trunc_wb_128_mem: +; KNL: ## %bb.0: +; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; KNL-NEXT: vmovq %xmm0, (%rdi) +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_wb_128_mem: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovwb %xmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <8 x i16> %i to <8 x i8> + store <8 x i8> %x, <8 x i8>* %res + ret void +} + + +define void @usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) { +; KNL-LABEL: usat_trunc_wb_256_mem: +; KNL: ## %bb.0: +; KNL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; KNL-NEXT: vpmovdb %zmm0, (%rdi) +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: usat_trunc_wb_256_mem: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovuswb %ymm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %x3 = icmp ult <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %x6 = trunc <16 x i16> %x5 to <16 x i8> + store <16 x i8> %x6, <16 x i8>* %res, align 1 + ret void +} + +define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) { +; KNL-LABEL: usat_trunc_wb_256: +; KNL: ## %bb.0: +; KNL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: usat_trunc_wb_256: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovuswb %ymm0, %xmm0 +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %x3 = icmp ult <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %x6 = trunc <16 x i16> %x5 to <16 x i8> + ret <16 x i8> %x6 +} + +define void @usat_trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) { +; KNL-LABEL: usat_trunc_wb_128_mem: +; KNL: ## %bb.0: +; KNL-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; KNL-NEXT: vmovq %xmm0, (%rdi) +; KNL-NEXT: retq +; +; SKX-LABEL: usat_trunc_wb_128_mem: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovuswb %xmm0, (%rdi) +; SKX-NEXT: retq + %x3 = icmp ult <8 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %x5 = select <8 x i1> %x3, <8 x i16> %i, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %x6 = trunc <8 x i16> %x5 to <8 x i8> + store <8 x i8> %x6, <8 x i8>* %res, align 1 + ret void +} + +define void @usat_trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) { +; ALL-LABEL: usat_trunc_db_512_mem: +; ALL: ## %bb.0: +; ALL-NEXT: vpmovusdb %zmm0, (%rdi) +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %x3 = icmp ult <16 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %x5 = select <16 x i1> %x3, <16 x i32> %i, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %x6 = trunc <16 x i32> %x5 to <16 x i8> + store <16 x i8> %x6, <16 x i8>* %res, align 1 + ret void +} + +define void @usat_trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) { +; ALL-LABEL: usat_trunc_qb_512_mem: +; ALL: ## %bb.0: +; ALL-NEXT: vpmovusqb %zmm0, (%rdi) +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %x3 = icmp ult <8 x i64> %i, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> + %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> + %x6 = trunc <8 x i64> %x5 to <8 x i8> + store <8 x i8> %x6, <8 x i8>* %res, align 1 + ret void +} + +define void @usat_trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) { +; ALL-LABEL: usat_trunc_qd_512_mem: +; ALL: ## %bb.0: +; ALL-NEXT: vpmovusqd %zmm0, (%rdi) +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %x3 = icmp ult <8 x i64> %i, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> + %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> + %x6 = trunc <8 x i64> %x5 to <8 x i32> + store <8 x i32> %x6, <8 x i32>* %res, align 1 + ret void +} + +define void @usat_trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) { +; ALL-LABEL: usat_trunc_qw_512_mem: +; ALL: ## %bb.0: +; ALL-NEXT: vpmovusqw %zmm0, (%rdi) +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %x3 = icmp ult <8 x i64> %i, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535> + %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535> + %x6 = trunc <8 x i64> %x5 to <8 x i16> + store <8 x i16> %x6, <8 x i16>* %res, align 1 + ret void +} + +define <32 x i8> @usat_trunc_db_1024(<32 x i32> %i) { +; ALL-LABEL: usat_trunc_db_1024: +; ALL: ## %bb.0: +; ALL-NEXT: vpmovusdb %zmm0, %xmm0 +; ALL-NEXT: vpmovusdb %zmm1, %xmm1 +; ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; ALL-NEXT: retq + %x3 = icmp ult <32 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %x5 = select <32 x i1> %x3, <32 x i32> %i, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %x6 = trunc <32 x i32> %x5 to <32 x i8> + ret <32 x i8> %x6 +} + +define void @usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) { +; ALL-LABEL: usat_trunc_db_1024_mem: +; ALL: ## %bb.0: +; ALL-NEXT: vpmovusdb %zmm0, %xmm0 +; ALL-NEXT: vpmovusdb %zmm1, %xmm1 +; ALL-NEXT: vmovdqu %xmm1, 16(%rdi) +; ALL-NEXT: vmovdqu %xmm0, (%rdi) +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %x3 = icmp ult <32 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %x5 = select <32 x i1> %x3, <32 x i32> %i, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %x6 = trunc <32 x i32> %x5 to <32 x i8> + store <32 x i8>%x6, <32 x i8>* %p, align 1 + ret void +} + +define <16 x i16> @usat_trunc_dw_512(<16 x i32> %i) { +; ALL-LABEL: usat_trunc_dw_512: +; ALL: ## %bb.0: +; ALL-NEXT: vpmovusdw %zmm0, %ymm0 +; ALL-NEXT: retq + %x3 = icmp ult <16 x i32> %i, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %x5 = select <16 x i1> %x3, <16 x i32> %i, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %x6 = trunc <16 x i32> %x5 to <16 x i16> + ret <16 x i16> %x6 +} + +define <8 x i8> @usat_trunc_wb_128(<8 x i16> %i) { +; ALL-LABEL: usat_trunc_wb_128: +; ALL: ## %bb.0: +; ALL-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0 +; ALL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; ALL-NEXT: retq + %x3 = icmp ult <8 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %x5 = select <8 x i1> %x3, <8 x i16> %i, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %x6 = trunc <8 x i16> %x5 to <8 x i8> + ret <8 x i8>%x6 +} + +define <16 x i16> @usat_trunc_qw_1024(<16 x i64> %i) { +; ALL-LABEL: usat_trunc_qw_1024: +; ALL: ## %bb.0: +; ALL-NEXT: vpmovusqw %zmm0, %xmm0 +; ALL-NEXT: vpmovusqw %zmm1, %xmm1 +; ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; ALL-NEXT: retq + %x3 = icmp ult <16 x i64> %i, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535> + %x5 = select <16 x i1> %x3, <16 x i64> %i, <16 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535> + %x6 = trunc <16 x i64> %x5 to <16 x i16> + ret <16 x i16> %x6 +} + +define <16 x i8> @usat_trunc_db_256(<8 x i32> %x) { +; KNL-LABEL: usat_trunc_db_256: +; KNL: ## %bb.0: +; KNL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; KNL-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: usat_trunc_db_256: +; SKX: ## %bb.0: +; SKX-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0 +; SKX-NEXT: vpmovdb %ymm0, %xmm0 +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %tmp1 = icmp ult <8 x i32> %x, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %tmp2 = select <8 x i1> %tmp1, <8 x i32> %x, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %tmp3 = trunc <8 x i32> %tmp2 to <8 x i8> + %tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %tmp4 +} + + + +; Tests for the following unsigned saturation pattern: + +; %a = icmp sgt %x, C1 +; %b = select %a, %x, C2 +; %c = icmp slt %b, C2 +; %d = select %c, %b, C2 +; %res = trunc %d + + +define void @smax_usat_trunc_wb_256_mem1(<16 x i16> %i, <16 x i8>* %res) { +; KNL-LABEL: smax_usat_trunc_wb_256_mem1: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; KNL-NEXT: vpmovdb %zmm0, (%rdi) +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: smax_usat_trunc_wb_256_mem1: +; SKX: ## %bb.0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; SKX-NEXT: vpmovuswb %ymm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %x1 = icmp sgt <16 x i16> %i, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0> + %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0> + %x3 = icmp slt <16 x i16> %x2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %x6 = trunc <16 x i16> %x5 to <16 x i8> + store <16 x i8> %x6, <16 x i8>* %res, align 1 + ret void +} + +; Test for smax(smin(x, C2), C1). +define void @smax_usat_trunc_wb_256_mem2(<16 x i16> %i, <16 x i8>* %res) { +; KNL-LABEL: smax_usat_trunc_wb_256_mem2: +; KNL: ## %bb.0: +; KNL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; KNL-NEXT: vpmovdb %zmm0, (%rdi) +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: smax_usat_trunc_wb_256_mem2: +; SKX: ## %bb.0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; SKX-NEXT: vpmovuswb %ymm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %x1 = icmp slt <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %x3 = icmp sgt <16 x i16> %x2, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0> + %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0> + %x6 = trunc <16 x i16> %x5 to <16 x i8> + store <16 x i8> %x6, <16 x i8>* %res, align 1 + ret void +} + +define <16 x i8> @smax_usat_trunc_wb_256(<16 x i16> %i) { +; KNL-LABEL: smax_usat_trunc_wb_256: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: smax_usat_trunc_wb_256: +; SKX: ## %bb.0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; SKX-NEXT: vpmovuswb %ymm0, %xmm0 +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %x1 = icmp sgt <16 x i16> %i, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0> + %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0> + %x3 = icmp slt <16 x i16> %x2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %x6 = trunc <16 x i16> %x5 to <16 x i8> + ret <16 x i8> %x6 + } + +define void @smax_usat_trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) { +; KNL-LABEL: smax_usat_trunc_wb_128_mem: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vpminsw {{.*}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; KNL-NEXT: vmovq %xmm0, (%rdi) +; KNL-NEXT: retq +; +; SKX-LABEL: smax_usat_trunc_wb_128_mem: +; SKX: ## %bb.0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; SKX-NEXT: vpmovuswb %xmm0, (%rdi) +; SKX-NEXT: retq + %x1 = icmp sgt <8 x i16> %i, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0> + %x2 = select <8 x i1> %x1, <8 x i16> %i, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0> + %x3 = icmp slt <8 x i16> %x2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %x5 = select <8 x i1> %x3, <8 x i16> %x2, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %x6 = trunc <8 x i16> %x5 to <8 x i8> + store <8 x i8> %x6, <8 x i8>* %res, align 1 + ret void +} + +define void @smax_usat_trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) { +; ALL-LABEL: smax_usat_trunc_db_512_mem: +; ALL: ## %bb.0: +; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 +; ALL-NEXT: vpmovusdb %zmm0, (%rdi) +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %x1 = icmp sgt <16 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + %x2 = select <16 x i1> %x1, <16 x i32> %i, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + %x3 = icmp slt <16 x i32> %x2, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %x5 = select <16 x i1> %x3, <16 x i32> %x2, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %x6 = trunc <16 x i32> %x5 to <16 x i8> + store <16 x i8> %x6, <16 x i8>* %res, align 1 + ret void +} + +define void @smax_usat_trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) { +; ALL-LABEL: smax_usat_trunc_qb_512_mem: +; ALL: ## %bb.0: +; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; ALL-NEXT: vpmovusqb %zmm0, (%rdi) +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %x1 = icmp sgt <8 x i64> %i, <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0> + %x2 = select <8 x i1> %x1, <8 x i64> %i, <8 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0> + %x3 = icmp slt <8 x i64> %x2, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> + %x5 = select <8 x i1> %x3, <8 x i64> %x2, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> + %x6 = trunc <8 x i64> %x5 to <8 x i8> + store <8 x i8> %x6, <8 x i8>* %res, align 1 + ret void +} + +define void @smax_usat_trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) { +; ALL-LABEL: smax_usat_trunc_qd_512_mem: +; ALL: ## %bb.0: +; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; ALL-NEXT: vpmovusqd %zmm0, (%rdi) +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %x1 = icmp sgt <8 x i64> %i, <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0> + %x2 = select <8 x i1> %x1, <8 x i64> %i, <8 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0> + %x3 = icmp slt <8 x i64> %x2, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> + %x5 = select <8 x i1> %x3, <8 x i64> %x2, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> + %x6 = trunc <8 x i64> %x5 to <8 x i32> + store <8 x i32> %x6, <8 x i32>* %res, align 1 + ret void +} + +define void @smax_usat_trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) { +; ALL-LABEL: smax_usat_trunc_qw_512_mem: +; ALL: ## %bb.0: +; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; ALL-NEXT: vpmovusqw %zmm0, (%rdi) +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %x1 = icmp sgt <8 x i64> %i, <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0> + %x2 = select <8 x i1> %x1, <8 x i64> %i, <8 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0> + %x3 = icmp slt <8 x i64> %x2, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535> + %x5 = select <8 x i1> %x3, <8 x i64> %x2, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535> + %x6 = trunc <8 x i64> %x5 to <8 x i16> + store <8 x i16> %x6, <8 x i16>* %res, align 1 + ret void +} + +define <32 x i8> @smax_usat_trunc_db_1024(<32 x i32> %i) { +; ALL-LABEL: smax_usat_trunc_db_1024: +; ALL: ## %bb.0: +; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; ALL-NEXT: vpmaxsd %zmm2, %zmm1, %zmm1 +; ALL-NEXT: vpmaxsd %zmm2, %zmm0, %zmm0 +; ALL-NEXT: vpmovusdb %zmm0, %xmm0 +; ALL-NEXT: vpmovusdb %zmm1, %xmm1 +; ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; ALL-NEXT: retq + %x1 = icmp sgt <32 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + %x2 = select <32 x i1> %x1, <32 x i32> %i, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + %x3 = icmp slt <32 x i32> %x2, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %x5 = select <32 x i1> %x3, <32 x i32> %x2, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %x6 = trunc <32 x i32> %x5 to <32 x i8> + ret <32 x i8> %x6 +} + +define void @smax_usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) { +; ALL-LABEL: smax_usat_trunc_db_1024_mem: +; ALL: ## %bb.0: +; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; ALL-NEXT: vpmaxsd %zmm2, %zmm1, %zmm1 +; ALL-NEXT: vpmaxsd %zmm2, %zmm0, %zmm0 +; ALL-NEXT: vpmovusdb %zmm0, %xmm0 +; ALL-NEXT: vpmovusdb %zmm1, %xmm1 +; ALL-NEXT: vmovdqu %xmm1, 16(%rdi) +; ALL-NEXT: vmovdqu %xmm0, (%rdi) +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %x1 = icmp sgt <32 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + %x2 = select <32 x i1> %x1, <32 x i32> %i, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + %x3 = icmp slt <32 x i32> %x2, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %x5 = select <32 x i1> %x3, <32 x i32> %x2, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %x6 = trunc <32 x i32> %x5 to <32 x i8> + store <32 x i8>%x6, <32 x i8>* %p, align 1 + ret void +} + +define <16 x i16> @smax_usat_trunc_dw_512(<16 x i32> %i) { +; ALL-LABEL: smax_usat_trunc_dw_512: +; ALL: ## %bb.0: +; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 +; ALL-NEXT: vpmovusdw %zmm0, %ymm0 +; ALL-NEXT: retq + %x1 = icmp sgt <16 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + %x2 = select <16 x i1> %x1, <16 x i32> %i, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + %x3 = icmp slt <16 x i32> %x2, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %x5 = select <16 x i1> %x3, <16 x i32> %x2, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %x6 = trunc <16 x i32> %x5 to <16 x i16> + ret <16 x i16> %x6 +} + +define void @negative_test1_smax_usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) { +; KNL-LABEL: negative_test1_smax_usat_trunc_wb_256_mem: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; KNL-NEXT: vpmovdb %zmm0, (%rdi) +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: negative_test1_smax_usat_trunc_wb_256_mem: +; SKX: ## %bb.0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; SKX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; SKX-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; SKX-NEXT: vpmovwb %ymm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %x1 = icmp sgt <16 x i16> %i, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0> + %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0> + %x3 = icmp slt <16 x i16> %x2, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> + %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> + %x6 = trunc <16 x i16> %x5 to <16 x i8> + store <16 x i8> %x6, <16 x i8>* %res, align 1 + ret void +} + +define void @negative_test2_smax_usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) { +; KNL-LABEL: negative_test2_smax_usat_trunc_wb_256_mem: +; KNL: ## %bb.0: +; KNL-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; KNL-NEXT: vpmovdb %zmm0, (%rdi) +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: negative_test2_smax_usat_trunc_wb_256_mem: +; SKX: ## %bb.0: +; SKX-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0 +; SKX-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 +; SKX-NEXT: vpmovwb %ymm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %x1 = icmp sgt <16 x i16> %i, <i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10> + %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10> + %x3 = icmp slt <16 x i16> %x2, <i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5> + %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> <i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5> + %x6 = trunc <16 x i16> %x5 to <16 x i8> + store <16 x i8> %x6, <16 x i8>* %res, align 1 + ret void +} diff --git a/llvm/test/CodeGen/X86/bswap-vector.ll b/llvm/test/CodeGen/X86/bswap-vector.ll index 4bb2b764ccc..f6473f8fc2f 100644 --- a/llvm/test/CodeGen/X86/bswap-vector.ll +++ b/llvm/test/CodeGen/X86/bswap-vector.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-SSE --check-prefix=CHECK-NOSSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-SSE --check-prefix=CHECK-SSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-AVX --check-prefix=CHECK-AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-WIDE-AVX --check-prefix=CHECK-WIDE-AVX2 declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>) declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) @@ -30,6 +31,11 @@ define <8 x i16> @test1(<8 x i16> %v) { ; CHECK-AVX: # %bb.0: # %entry ; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] ; CHECK-AVX-NEXT: retq +; +; CHECK-WIDE-AVX-LABEL: test1: +; CHECK-WIDE-AVX: # %bb.0: # %entry +; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; CHECK-WIDE-AVX-NEXT: retq entry: %r = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %v) ret <8 x i16> %r @@ -58,6 +64,11 @@ define <4 x i32> @test2(<4 x i32> %v) { ; CHECK-AVX: # %bb.0: ; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; CHECK-AVX-NEXT: retq +; +; CHECK-WIDE-AVX-LABEL: test2: +; CHECK-WIDE-AVX: # %bb.0: +; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; CHECK-WIDE-AVX-NEXT: retq %r = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %v) ret <4 x i32> %r } @@ -88,6 +99,12 @@ define <4 x i32> @or_bswap(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %p1, <4 x i32> ; CHECK-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; CHECK-AVX-NEXT: retq +; +; CHECK-WIDE-AVX-LABEL: or_bswap: +; CHECK-WIDE-AVX: # %bb.0: +; CHECK-WIDE-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; CHECK-WIDE-AVX-NEXT: retq %xt = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %x) %yt = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %y) %r = or <4 x i32> %xt, %yt @@ -119,6 +136,11 @@ define <2 x i64> @test3(<2 x i64> %v) { ; CHECK-AVX: # %bb.0: # %entry ; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] ; CHECK-AVX-NEXT: retq +; +; CHECK-WIDE-AVX-LABEL: test3: +; CHECK-WIDE-AVX: # %bb.0: # %entry +; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; CHECK-WIDE-AVX-NEXT: retq entry: %r = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %v) ret <2 x i64> %r @@ -161,6 +183,11 @@ define <16 x i16> @test4(<16 x i16> %v) { ; CHECK-AVX: # %bb.0: # %entry ; CHECK-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] ; CHECK-AVX-NEXT: retq +; +; CHECK-WIDE-AVX-LABEL: test4: +; CHECK-WIDE-AVX: # %bb.0: # %entry +; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] +; CHECK-WIDE-AVX-NEXT: retq entry: %r = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %v) ret <16 x i16> %r @@ -199,6 +226,11 @@ define <8 x i32> @test5(<8 x i32> %v) { ; CHECK-AVX: # %bb.0: # %entry ; CHECK-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] ; CHECK-AVX-NEXT: retq +; +; CHECK-WIDE-AVX-LABEL: test5: +; CHECK-WIDE-AVX: # %bb.0: # %entry +; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] +; CHECK-WIDE-AVX-NEXT: retq entry: %r = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %v) ret <8 x i32> %r @@ -241,6 +273,11 @@ define <4 x i64> @test6(<4 x i64> %v) { ; CHECK-AVX: # %bb.0: # %entry ; CHECK-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] ; CHECK-AVX-NEXT: retq +; +; CHECK-WIDE-AVX-LABEL: test6: +; CHECK-WIDE-AVX: # %bb.0: # %entry +; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] +; CHECK-WIDE-AVX-NEXT: retq entry: %r = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %v) ret <4 x i64> %r @@ -271,6 +308,11 @@ define <4 x i16> @test7(<4 x i16> %v) { ; CHECK-AVX: # %bb.0: # %entry ; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] ; CHECK-AVX-NEXT: retq +; +; CHECK-WIDE-AVX-LABEL: test7: +; CHECK-WIDE-AVX: # %bb.0: # %entry +; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; CHECK-WIDE-AVX-NEXT: retq entry: %r = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %v) ret <4 x i16> %r @@ -364,6 +406,11 @@ define <8 x i16> @fold_v8i16() { ; CHECK-AVX: # %bb.0: # %entry ; CHECK-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,256,65535,512,65023,1024,64511,1536] ; CHECK-AVX-NEXT: retq +; +; CHECK-WIDE-AVX-LABEL: fold_v8i16: +; CHECK-WIDE-AVX: # %bb.0: # %entry +; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,256,65535,512,65023,1024,64511,1536] +; CHECK-WIDE-AVX-NEXT: retq entry: %r = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> <i16 0, i16 1, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6>) ret <8 x i16> %r @@ -379,6 +426,11 @@ define <4 x i32> @fold_v4i32() { ; CHECK-AVX: # %bb.0: # %entry ; CHECK-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,33554432,4261412863] ; CHECK-AVX-NEXT: retq +; +; CHECK-WIDE-AVX-LABEL: fold_v4i32: +; CHECK-WIDE-AVX: # %bb.0: # %entry +; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,33554432,4261412863] +; CHECK-WIDE-AVX-NEXT: retq entry: %r = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> <i32 0, i32 -1, i32 2, i32 -3>) ret <4 x i32> %r @@ -394,6 +446,11 @@ define <2 x i64> @fold_v2i64() { ; CHECK-AVX: # %bb.0: # %entry ; CHECK-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18374686479671623680,18446744073709551615] ; CHECK-AVX-NEXT: retq +; +; CHECK-WIDE-AVX-LABEL: fold_v2i64: +; CHECK-WIDE-AVX: # %bb.0: # %entry +; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18374686479671623680,18446744073709551615] +; CHECK-WIDE-AVX-NEXT: retq entry: %r = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> <i64 255, i64 -1>) ret <2 x i64> %r @@ -410,6 +467,11 @@ define <16 x i16> @fold_v16i16() { ; CHECK-AVX: # %bb.0: # %entry ; CHECK-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,256,65535,512,65023,1024,64511,1536,63999,2048,63487,2560,62975,3072,62463,3584] ; CHECK-AVX-NEXT: retq +; +; CHECK-WIDE-AVX-LABEL: fold_v16i16: +; CHECK-WIDE-AVX: # %bb.0: # %entry +; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,256,65535,512,65023,1024,64511,1536,63999,2048,63487,2560,62975,3072,62463,3584] +; CHECK-WIDE-AVX-NEXT: retq entry: %r = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> <i16 0, i16 1, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14>) ret <16 x i16> %r @@ -426,6 +488,11 @@ define <8 x i32> @fold_v8i32() { ; CHECK-AVX: # %bb.0: # %entry ; CHECK-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,16777216,4294967295,33554432,4261412863,67108864,4227858431,100663296] ; CHECK-AVX-NEXT: retq +; +; CHECK-WIDE-AVX-LABEL: fold_v8i32: +; CHECK-WIDE-AVX: # %bb.0: # %entry +; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,16777216,4294967295,33554432,4261412863,67108864,4227858431,100663296] +; CHECK-WIDE-AVX-NEXT: retq entry: %r = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> <i32 0, i32 1, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6>) ret <8 x i32> %r @@ -442,6 +509,11 @@ define <4 x i64> @fold_v4i64() { ; CHECK-AVX: # %bb.0: # %entry ; CHECK-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18374686479671623680,18446744073709551615,18446462598732840960,72056494526300160] ; CHECK-AVX-NEXT: retq +; +; CHECK-WIDE-AVX-LABEL: fold_v4i64: +; CHECK-WIDE-AVX: # %bb.0: # %entry +; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18374686479671623680,18446744073709551615,18446462598732840960,72056494526300160] +; CHECK-WIDE-AVX-NEXT: retq entry: %r = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> <i64 255, i64 -1, i64 65535, i64 16776960>) ret <4 x i64> %r diff --git a/llvm/test/CodeGen/X86/lower-bitcast.ll b/llvm/test/CodeGen/X86/lower-bitcast.ll index d41d2ea83bc..e31c9f8967c 100644 --- a/llvm/test/CodeGen/X86/lower-bitcast.ll +++ b/llvm/test/CodeGen/X86/lower-bitcast.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=core2 -mattr=+sse2 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=core2 -mattr=+sse2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-WIDE ; FIXME: Ideally we should be able to fold the entire body of @test1 into a ; single paddd instruction. At the moment we produce the sequence @@ -10,6 +11,11 @@ define double @test1(double %A) { ; CHECK: # %bb.0: ; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-NEXT: retq +; +; CHECK-WIDE-LABEL: test1: +; CHECK-WIDE: # %bb.0: +; CHECK-WIDE-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-WIDE-NEXT: retq %1 = bitcast double %A to <2 x i32> %add = add <2 x i32> %1, <i32 3, i32 5> %2 = bitcast <2 x i32> %add to double @@ -21,6 +27,11 @@ define double @test2(double %A, double %B) { ; CHECK: # %bb.0: ; CHECK-NEXT: paddd %xmm1, %xmm0 ; CHECK-NEXT: retq +; +; CHECK-WIDE-LABEL: test2: +; CHECK-WIDE: # %bb.0: +; CHECK-WIDE-NEXT: paddd %xmm1, %xmm0 +; CHECK-WIDE-NEXT: retq %1 = bitcast double %A to <2 x i32> %2 = bitcast double %B to <2 x i32> %add = add <2 x i32> %1, %2 @@ -35,6 +46,13 @@ define i64 @test3(i64 %A) { ; CHECK-NEXT: addps {{.*}}(%rip), %xmm0 ; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: retq +; +; CHECK-WIDE-LABEL: test3: +; CHECK-WIDE: # %bb.0: +; CHECK-WIDE-NEXT: movq %rdi, %xmm0 +; CHECK-WIDE-NEXT: addps {{.*}}(%rip), %xmm0 +; CHECK-WIDE-NEXT: movq %xmm0, %rax +; CHECK-WIDE-NEXT: retq %1 = bitcast i64 %A to <2 x float> %add = fadd <2 x float> %1, <float 3.0, float 5.0> %2 = bitcast <2 x float> %add to i64 @@ -51,6 +69,13 @@ define i64 @test4(i64 %A) { ; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: retq +; +; CHECK-WIDE-LABEL: test4: +; CHECK-WIDE: # %bb.0: +; CHECK-WIDE-NEXT: movq %rdi, %xmm0 +; CHECK-WIDE-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-WIDE-NEXT: movq %xmm0, %rax +; CHECK-WIDE-NEXT: retq %1 = bitcast i64 %A to <2 x i32> %add = add <2 x i32> %1, <i32 3, i32 5> %2 = bitcast <2 x i32> %add to i64 @@ -62,6 +87,11 @@ define double @test5(double %A) { ; CHECK: # %bb.0: ; CHECK-NEXT: addps {{.*}}(%rip), %xmm0 ; CHECK-NEXT: retq +; +; CHECK-WIDE-LABEL: test5: +; CHECK-WIDE: # %bb.0: +; CHECK-WIDE-NEXT: addps {{.*}}(%rip), %xmm0 +; CHECK-WIDE-NEXT: retq %1 = bitcast double %A to <2 x float> %add = fadd <2 x float> %1, <float 3.0, float 5.0> %2 = bitcast <2 x float> %add to double @@ -76,6 +106,11 @@ define double @test6(double %A) { ; CHECK: # %bb.0: ; CHECK-NEXT: paddw {{.*}}(%rip), %xmm0 ; CHECK-NEXT: retq +; +; CHECK-WIDE-LABEL: test6: +; CHECK-WIDE: # %bb.0: +; CHECK-WIDE-NEXT: paddw {{.*}}(%rip), %xmm0 +; CHECK-WIDE-NEXT: retq %1 = bitcast double %A to <4 x i16> %add = add <4 x i16> %1, <i16 3, i16 4, i16 5, i16 6> %2 = bitcast <4 x i16> %add to double @@ -87,6 +122,11 @@ define double @test7(double %A, double %B) { ; CHECK: # %bb.0: ; CHECK-NEXT: paddw %xmm1, %xmm0 ; CHECK-NEXT: retq +; +; CHECK-WIDE-LABEL: test7: +; CHECK-WIDE: # %bb.0: +; CHECK-WIDE-NEXT: paddw %xmm1, %xmm0 +; CHECK-WIDE-NEXT: retq %1 = bitcast double %A to <4 x i16> %2 = bitcast double %B to <4 x i16> %add = add <4 x i16> %1, %2 @@ -103,6 +143,11 @@ define double @test8(double %A) { ; CHECK: # %bb.0: ; CHECK-NEXT: paddb {{.*}}(%rip), %xmm0 ; CHECK-NEXT: retq +; +; CHECK-WIDE-LABEL: test8: +; CHECK-WIDE: # %bb.0: +; CHECK-WIDE-NEXT: paddb {{.*}}(%rip), %xmm0 +; CHECK-WIDE-NEXT: retq %1 = bitcast double %A to <8 x i8> %add = add <8 x i8> %1, <i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10> %2 = bitcast <8 x i8> %add to double @@ -114,6 +159,11 @@ define double @test9(double %A, double %B) { ; CHECK: # %bb.0: ; CHECK-NEXT: paddb %xmm1, %xmm0 ; CHECK-NEXT: retq +; +; CHECK-WIDE-LABEL: test9: +; CHECK-WIDE: # %bb.0: +; CHECK-WIDE-NEXT: paddb %xmm1, %xmm0 +; CHECK-WIDE-NEXT: retq %1 = bitcast double %A to <8 x i8> %2 = bitcast double %B to <8 x i8> %add = add <8 x i8> %1, %2 diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll index fc6341640c4..fc9aa62b948 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll @@ -1,7 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=CHECK --check-prefix=WIDEN --check-prefix=WIDEN_SKX -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=CHECK --check-prefix=WIDEN --check-prefix=WIDEN_KNL -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake < %s | FileCheck %s --check-prefix=WIDEN_AVX2 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq -x86-experimental-vector-widening-legalization < %s | FileCheck %s --check-prefix=CHECK --check-prefix=WIDEN --check-prefix=WIDEN_SKX +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -x86-experimental-vector-widening-legalization < %s | FileCheck %s --check-prefix=CHECK --check-prefix=WIDEN --check-prefix=WIDEN_KNL +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=CHECK --check-prefix=PROMOTE --check-prefix=PROMOTE_SKX +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=CHECK --check-prefix=PROMOTE --check-prefix=PROMOTE_KNL +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -x86-experimental-vector-widening-legalization < %s | FileCheck %s --check-prefix=WIDEN_AVX2 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake < %s | FileCheck %s --check-prefix=PROMOTE_AVX2 define <2 x double> @test_gather_v2i32_index(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) { ; WIDEN_SKX-LABEL: test_gather_v2i32_index: @@ -25,12 +28,40 @@ define <2 x double> @test_gather_v2i32_index(double* %base, <2 x i32> %ind, <2 x ; WIDEN_KNL-NEXT: vzeroupper ; WIDEN_KNL-NEXT: retq ; +; PROMOTE_SKX-LABEL: test_gather_v2i32_index: +; PROMOTE_SKX: # %bb.0: +; PROMOTE_SKX-NEXT: vpsllq $63, %xmm1, %xmm1 +; PROMOTE_SKX-NEXT: vpmovq2m %xmm1, %k1 +; PROMOTE_SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %xmm2 {%k1} +; PROMOTE_SKX-NEXT: vmovapd %xmm2, %xmm0 +; PROMOTE_SKX-NEXT: retq +; +; PROMOTE_KNL-LABEL: test_gather_v2i32_index: +; PROMOTE_KNL: # %bb.0: +; PROMOTE_KNL-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 +; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; PROMOTE_KNL-NEXT: vpsllq $63, %xmm1, %xmm1 +; PROMOTE_KNL-NEXT: vptestmq %zmm1, %zmm1, %k0 +; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0 +; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1 +; PROMOTE_KNL-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k1} +; PROMOTE_KNL-NEXT: vmovapd %xmm2, %xmm0 +; PROMOTE_KNL-NEXT: vzeroupper +; PROMOTE_KNL-NEXT: retq +; ; WIDEN_AVX2-LABEL: test_gather_v2i32_index: ; WIDEN_AVX2: # %bb.0: ; WIDEN_AVX2-NEXT: vpsllq $63, %xmm1, %xmm1 ; WIDEN_AVX2-NEXT: vgatherdpd %xmm1, (%rdi,%xmm0,8), %xmm2 ; WIDEN_AVX2-NEXT: vmovapd %xmm2, %xmm0 ; WIDEN_AVX2-NEXT: retq +; +; PROMOTE_AVX2-LABEL: test_gather_v2i32_index: +; PROMOTE_AVX2: # %bb.0: +; PROMOTE_AVX2-NEXT: vpsllq $63, %xmm1, %xmm1 +; PROMOTE_AVX2-NEXT: vgatherdpd %xmm1, (%rdi,%xmm0,8), %xmm2 +; PROMOTE_AVX2-NEXT: vmovapd %xmm2, %xmm0 +; PROMOTE_AVX2-NEXT: retq %gep.random = getelementptr double, double* %base, <2 x i32> %ind %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0) ret <2 x double> %res @@ -56,6 +87,25 @@ define void @test_scatter_v2i32_index(<2 x double> %a1, double* %base, <2 x i32> ; WIDEN_KNL-NEXT: vzeroupper ; WIDEN_KNL-NEXT: retq ; +; PROMOTE_SKX-LABEL: test_scatter_v2i32_index: +; PROMOTE_SKX: # %bb.0: +; PROMOTE_SKX-NEXT: vpsllq $63, %xmm2, %xmm2 +; PROMOTE_SKX-NEXT: vpmovq2m %xmm2, %k1 +; PROMOTE_SKX-NEXT: vscatterdpd %xmm0, (%rdi,%xmm1,8) {%k1} +; PROMOTE_SKX-NEXT: retq +; +; PROMOTE_KNL-LABEL: test_scatter_v2i32_index: +; PROMOTE_KNL: # %bb.0: +; PROMOTE_KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; PROMOTE_KNL-NEXT: vpsllq $63, %xmm2, %xmm2 +; PROMOTE_KNL-NEXT: vptestmq %zmm2, %zmm2, %k0 +; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0 +; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1 +; PROMOTE_KNL-NEXT: vscatterdpd %zmm0, (%rdi,%ymm1,8) {%k1} +; PROMOTE_KNL-NEXT: vzeroupper +; PROMOTE_KNL-NEXT: retq +; ; WIDEN_AVX2-LABEL: test_scatter_v2i32_index: ; WIDEN_AVX2: # %bb.0: ; WIDEN_AVX2-NEXT: vpmovsxdq %xmm1, %xmm1 @@ -81,6 +131,32 @@ define void @test_scatter_v2i32_index(<2 x double> %a1, double* %base, <2 x i32> ; WIDEN_AVX2-NEXT: vpextrq $1, %xmm1, %rax ; WIDEN_AVX2-NEXT: vmovhps %xmm0, (%rax) ; WIDEN_AVX2-NEXT: retq +; +; PROMOTE_AVX2-LABEL: test_scatter_v2i32_index: +; PROMOTE_AVX2: # %bb.0: +; PROMOTE_AVX2-NEXT: vpmovsxdq %xmm1, %xmm1 +; PROMOTE_AVX2-NEXT: vpsllq $3, %xmm1, %xmm1 +; PROMOTE_AVX2-NEXT: vmovq %rdi, %xmm3 +; PROMOTE_AVX2-NEXT: vpbroadcastq %xmm3, %xmm3 +; PROMOTE_AVX2-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; PROMOTE_AVX2-NEXT: vpsllq $63, %xmm2, %xmm2 +; PROMOTE_AVX2-NEXT: vmovmskpd %xmm2, %eax +; PROMOTE_AVX2-NEXT: testb $1, %al +; PROMOTE_AVX2-NEXT: jne .LBB1_1 +; PROMOTE_AVX2-NEXT: # %bb.2: # %else +; PROMOTE_AVX2-NEXT: testb $2, %al +; PROMOTE_AVX2-NEXT: jne .LBB1_3 +; PROMOTE_AVX2-NEXT: .LBB1_4: # %else2 +; PROMOTE_AVX2-NEXT: retq +; PROMOTE_AVX2-NEXT: .LBB1_1: # %cond.store +; PROMOTE_AVX2-NEXT: vmovq %xmm1, %rcx +; PROMOTE_AVX2-NEXT: vmovlps %xmm0, (%rcx) +; PROMOTE_AVX2-NEXT: testb $2, %al +; PROMOTE_AVX2-NEXT: je .LBB1_4 +; PROMOTE_AVX2-NEXT: .LBB1_3: # %cond.store1 +; PROMOTE_AVX2-NEXT: vpextrq $1, %xmm1, %rax +; PROMOTE_AVX2-NEXT: vmovhps %xmm0, (%rax) +; PROMOTE_AVX2-NEXT: retq %gep = getelementptr double, double *%base, <2 x i32> %ind call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %a1, <2 x double*> %gep, i32 4, <2 x i1> %mask) ret void @@ -108,6 +184,27 @@ define <2 x i32> @test_gather_v2i32_data(<2 x i32*> %ptr, <2 x i1> %mask, <2 x i ; WIDEN_KNL-NEXT: vzeroupper ; WIDEN_KNL-NEXT: retq ; +; PROMOTE_SKX-LABEL: test_gather_v2i32_data: +; PROMOTE_SKX: # %bb.0: +; PROMOTE_SKX-NEXT: vpsllq $63, %xmm1, %xmm1 +; PROMOTE_SKX-NEXT: vpmovq2m %xmm1, %k1 +; PROMOTE_SKX-NEXT: vpgatherqd (,%xmm0), %xmm2 {%k1} +; PROMOTE_SKX-NEXT: vmovdqa %xmm2, %xmm0 +; PROMOTE_SKX-NEXT: retq +; +; PROMOTE_KNL-LABEL: test_gather_v2i32_data: +; PROMOTE_KNL: # %bb.0: +; PROMOTE_KNL-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; PROMOTE_KNL-NEXT: vpsllq $63, %xmm1, %xmm1 +; PROMOTE_KNL-NEXT: vptestmq %zmm1, %zmm1, %k0 +; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0 +; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1 +; PROMOTE_KNL-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k1} +; PROMOTE_KNL-NEXT: vmovdqa %xmm2, %xmm0 +; PROMOTE_KNL-NEXT: vzeroupper +; PROMOTE_KNL-NEXT: retq +; ; WIDEN_AVX2-LABEL: test_gather_v2i32_data: ; WIDEN_AVX2: # %bb.0: ; WIDEN_AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] @@ -115,6 +212,14 @@ define <2 x i32> @test_gather_v2i32_data(<2 x i32*> %ptr, <2 x i1> %mask, <2 x i ; WIDEN_AVX2-NEXT: vpgatherqd %xmm1, (,%xmm0), %xmm2 ; WIDEN_AVX2-NEXT: vmovdqa %xmm2, %xmm0 ; WIDEN_AVX2-NEXT: retq +; +; PROMOTE_AVX2-LABEL: test_gather_v2i32_data: +; PROMOTE_AVX2: # %bb.0: +; PROMOTE_AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; PROMOTE_AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; PROMOTE_AVX2-NEXT: vpgatherqd %xmm1, (,%xmm0), %xmm2 +; PROMOTE_AVX2-NEXT: vmovdqa %xmm2, %xmm0 +; PROMOTE_AVX2-NEXT: retq %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %ptr, i32 4, <2 x i1> %mask, <2 x i32> %src0) ret <2 x i32>%res } @@ -139,6 +244,25 @@ define void @test_scatter_v2i32_data(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mas ; WIDEN_KNL-NEXT: vzeroupper ; WIDEN_KNL-NEXT: retq ; +; PROMOTE_SKX-LABEL: test_scatter_v2i32_data: +; PROMOTE_SKX: # %bb.0: +; PROMOTE_SKX-NEXT: vpsllq $63, %xmm2, %xmm2 +; PROMOTE_SKX-NEXT: vpmovq2m %xmm2, %k1 +; PROMOTE_SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1} +; PROMOTE_SKX-NEXT: retq +; +; PROMOTE_KNL-LABEL: test_scatter_v2i32_data: +; PROMOTE_KNL: # %bb.0: +; PROMOTE_KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; PROMOTE_KNL-NEXT: vpsllq $63, %xmm2, %xmm2 +; PROMOTE_KNL-NEXT: vptestmq %zmm2, %zmm2, %k0 +; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0 +; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1 +; PROMOTE_KNL-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; PROMOTE_KNL-NEXT: vzeroupper +; PROMOTE_KNL-NEXT: retq +; ; WIDEN_AVX2-LABEL: test_scatter_v2i32_data: ; WIDEN_AVX2: # %bb.0: ; WIDEN_AVX2-NEXT: vpsllq $63, %xmm2, %xmm2 @@ -159,6 +283,27 @@ define void @test_scatter_v2i32_data(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mas ; WIDEN_AVX2-NEXT: vpextrq $1, %xmm1, %rax ; WIDEN_AVX2-NEXT: vextractps $1, %xmm0, (%rax) ; WIDEN_AVX2-NEXT: retq +; +; PROMOTE_AVX2-LABEL: test_scatter_v2i32_data: +; PROMOTE_AVX2: # %bb.0: +; PROMOTE_AVX2-NEXT: vpsllq $63, %xmm2, %xmm2 +; PROMOTE_AVX2-NEXT: vmovmskpd %xmm2, %eax +; PROMOTE_AVX2-NEXT: testb $1, %al +; PROMOTE_AVX2-NEXT: jne .LBB3_1 +; PROMOTE_AVX2-NEXT: # %bb.2: # %else +; PROMOTE_AVX2-NEXT: testb $2, %al +; PROMOTE_AVX2-NEXT: jne .LBB3_3 +; PROMOTE_AVX2-NEXT: .LBB3_4: # %else2 +; PROMOTE_AVX2-NEXT: retq +; PROMOTE_AVX2-NEXT: .LBB3_1: # %cond.store +; PROMOTE_AVX2-NEXT: vmovq %xmm1, %rcx +; PROMOTE_AVX2-NEXT: vmovss %xmm0, (%rcx) +; PROMOTE_AVX2-NEXT: testb $2, %al +; PROMOTE_AVX2-NEXT: je .LBB3_4 +; PROMOTE_AVX2-NEXT: .LBB3_3: # %cond.store1 +; PROMOTE_AVX2-NEXT: vpextrq $1, %xmm1, %rax +; PROMOTE_AVX2-NEXT: vextractps $1, %xmm0, (%rax) +; PROMOTE_AVX2-NEXT: retq call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask) ret void } @@ -185,6 +330,27 @@ define <2 x i32> @test_gather_v2i32_data_index(i32* %base, <2 x i32> %ind, <2 x ; WIDEN_KNL-NEXT: vzeroupper ; WIDEN_KNL-NEXT: retq ; +; PROMOTE_SKX-LABEL: test_gather_v2i32_data_index: +; PROMOTE_SKX: # %bb.0: +; PROMOTE_SKX-NEXT: vpsllq $63, %xmm1, %xmm1 +; PROMOTE_SKX-NEXT: vpmovq2m %xmm1, %k1 +; PROMOTE_SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm2 {%k1} +; PROMOTE_SKX-NEXT: vmovdqa %xmm2, %xmm0 +; PROMOTE_SKX-NEXT: retq +; +; PROMOTE_KNL-LABEL: test_gather_v2i32_data_index: +; PROMOTE_KNL: # %bb.0: +; PROMOTE_KNL-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 +; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; PROMOTE_KNL-NEXT: vpsllq $63, %xmm1, %xmm1 +; PROMOTE_KNL-NEXT: vptestmq %zmm1, %zmm1, %k0 +; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0 +; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1 +; PROMOTE_KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} +; PROMOTE_KNL-NEXT: vmovdqa %xmm2, %xmm0 +; PROMOTE_KNL-NEXT: vzeroupper +; PROMOTE_KNL-NEXT: retq +; ; WIDEN_AVX2-LABEL: test_gather_v2i32_data_index: ; WIDEN_AVX2: # %bb.0: ; WIDEN_AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero @@ -192,6 +358,14 @@ define <2 x i32> @test_gather_v2i32_data_index(i32* %base, <2 x i32> %ind, <2 x ; WIDEN_AVX2-NEXT: vpgatherdd %xmm1, (%rdi,%xmm0,4), %xmm2 ; WIDEN_AVX2-NEXT: vmovdqa %xmm2, %xmm0 ; WIDEN_AVX2-NEXT: retq +; +; PROMOTE_AVX2-LABEL: test_gather_v2i32_data_index: +; PROMOTE_AVX2: # %bb.0: +; PROMOTE_AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero +; PROMOTE_AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; PROMOTE_AVX2-NEXT: vpgatherdd %xmm1, (%rdi,%xmm0,4), %xmm2 +; PROMOTE_AVX2-NEXT: vmovdqa %xmm2, %xmm0 +; PROMOTE_AVX2-NEXT: retq %gep.random = getelementptr i32, i32* %base, <2 x i32> %ind %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0) ret <2 x i32> %res @@ -217,6 +391,25 @@ define void @test_scatter_v2i32_data_index(<2 x i32> %a1, i32* %base, <2 x i32> ; WIDEN_KNL-NEXT: vzeroupper ; WIDEN_KNL-NEXT: retq ; +; PROMOTE_SKX-LABEL: test_scatter_v2i32_data_index: +; PROMOTE_SKX: # %bb.0: +; PROMOTE_SKX-NEXT: vpsllq $63, %xmm2, %xmm2 +; PROMOTE_SKX-NEXT: vpmovq2m %xmm2, %k1 +; PROMOTE_SKX-NEXT: vpscatterdd %xmm0, (%rdi,%xmm1,4) {%k1} +; PROMOTE_SKX-NEXT: retq +; +; PROMOTE_KNL-LABEL: test_scatter_v2i32_data_index: +; PROMOTE_KNL: # %bb.0: +; PROMOTE_KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; PROMOTE_KNL-NEXT: vpsllq $63, %xmm2, %xmm2 +; PROMOTE_KNL-NEXT: vptestmq %zmm2, %zmm2, %k0 +; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0 +; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1 +; PROMOTE_KNL-NEXT: vpscatterdd %zmm0, (%rdi,%zmm1,4) {%k1} +; PROMOTE_KNL-NEXT: vzeroupper +; PROMOTE_KNL-NEXT: retq +; ; WIDEN_AVX2-LABEL: test_scatter_v2i32_data_index: ; WIDEN_AVX2: # %bb.0: ; WIDEN_AVX2-NEXT: vpmovsxdq %xmm1, %xmm1 @@ -242,6 +435,32 @@ define void @test_scatter_v2i32_data_index(<2 x i32> %a1, i32* %base, <2 x i32> ; WIDEN_AVX2-NEXT: vpextrq $1, %xmm1, %rax ; WIDEN_AVX2-NEXT: vextractps $1, %xmm0, (%rax) ; WIDEN_AVX2-NEXT: retq +; +; PROMOTE_AVX2-LABEL: test_scatter_v2i32_data_index: +; PROMOTE_AVX2: # %bb.0: +; PROMOTE_AVX2-NEXT: vpmovsxdq %xmm1, %xmm1 +; PROMOTE_AVX2-NEXT: vpsllq $2, %xmm1, %xmm1 +; PROMOTE_AVX2-NEXT: vmovq %rdi, %xmm3 +; PROMOTE_AVX2-NEXT: vpbroadcastq %xmm3, %xmm3 +; PROMOTE_AVX2-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; PROMOTE_AVX2-NEXT: vpsllq $63, %xmm2, %xmm2 +; PROMOTE_AVX2-NEXT: vmovmskpd %xmm2, %eax +; PROMOTE_AVX2-NEXT: testb $1, %al +; PROMOTE_AVX2-NEXT: jne .LBB5_1 +; PROMOTE_AVX2-NEXT: # %bb.2: # %else +; PROMOTE_AVX2-NEXT: testb $2, %al +; PROMOTE_AVX2-NEXT: jne .LBB5_3 +; PROMOTE_AVX2-NEXT: .LBB5_4: # %else2 +; PROMOTE_AVX2-NEXT: retq +; PROMOTE_AVX2-NEXT: .LBB5_1: # %cond.store +; PROMOTE_AVX2-NEXT: vmovq %xmm1, %rcx +; PROMOTE_AVX2-NEXT: vmovss %xmm0, (%rcx) +; PROMOTE_AVX2-NEXT: testb $2, %al +; PROMOTE_AVX2-NEXT: je .LBB5_4 +; PROMOTE_AVX2-NEXT: .LBB5_3: # %cond.store1 +; PROMOTE_AVX2-NEXT: vpextrq $1, %xmm1, %rax +; PROMOTE_AVX2-NEXT: vextractps $1, %xmm0, (%rax) +; PROMOTE_AVX2-NEXT: retq %gep = getelementptr i32, i32 *%base, <2 x i32> %ind call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %gep, i32 4, <2 x i1> %mask) ret void diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll index eef26dfa8c2..269d0b51c7b 100644 --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 --check-prefix=SSE2-PROMOTE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 --check-prefix=SSE2-WIDEN ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 --check-prefix=SSE41-PROMOTE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 --check-prefix=SSE41-WIDEN ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW diff --git a/llvm/test/CodeGen/X86/shrink_vmul-widen.ll b/llvm/test/CodeGen/X86/shrink_vmul-widen.ll new file mode 100644 index 00000000000..75998580074 --- /dev/null +++ b/llvm/test/CodeGen/X86/shrink_vmul-widen.ll @@ -0,0 +1,2553 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 + +@c = external global i32*, align 8 + +; %val1 = load <2 x i8> +; %op1 = zext<2 x i32> %val1 +; %val2 = load <2 x i8> +; %op2 = zext<2 x i32> %val2 +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_2xi8: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax +; X86-SSE-NEXT: movd %eax, %xmm1 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi8: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax +; X86-AVX-NEXT: vmovd %eax, %xmm1 +; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm1 +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx +; X64-AVX-NEXT: vmovd %ecx, %xmm0 +; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx +; X64-AVX-NEXT: vmovd %ecx, %xmm1 +; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i8>* + %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 + %tmp8 = zext <2 x i8> %wide.load to <2 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <2 x i8>* + %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1 + %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <4 x i8> +; %op1 = zext<4 x i32> %val1 +; %val2 = load <4 x i8> +; %op2 = zext<4 x i32> %val2 +; %rst = mul <4 x i32> %op1, %op2 +; +define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_4xi8: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_4xi8: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_4xi8: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_4xi8: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <4 x i8>* + %wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1 + %tmp8 = zext <4 x i8> %wide.load to <4 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <4 x i8>* + %wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1 + %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32> + %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <4 x i32>* + store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <8 x i8> +; %op1 = zext<8 x i32> %val1 +; %val2 = load <8 x i8> +; %op2 = zext<8 x i32> %val2 +; %rst = mul <8 x i32> %op1, %op2 +; +define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_8xi8: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: mul_8xi8: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: movl c, %esi +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 +; X86-AVX1-NEXT: vmovdqu %xmm0, 16(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm1, (%esi,%ecx,4) +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: mul_8xi8: +; X86-AVX2: # %bb.0: # %entry +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX2-NEXT: movl c, %esi +; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: mul_8xi8: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: mul_8xi8: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0 +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 +; X64-AVX1-NEXT: vmovdqu %xmm0, 16(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm1, (%rax,%rdx,4) +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: mul_8xi8: +; X64-AVX2: # %bb.0: # %entry +; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <8 x i8>* + %wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1 + %tmp8 = zext <8 x i8> %wide.load to <8 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <8 x i8>* + %wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1 + %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32> + %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <8 x i32>* + store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <16 x i8> +; %op1 = zext<16 x i32> %val1 +; %val2 = load <16 x i8> +; %op2 = zext<16 x i32> %val2 +; %rst = mul <16 x i32> %op1, %op2 +; +define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_16xi8: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 +; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; X86-SSE-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X86-SSE-NEXT: pmullw %xmm3, %xmm4 +; X86-SSE-NEXT: movdqa %xmm4, %xmm3 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, 32(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm4, 16(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm3, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: mul_16xi8: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: movl c, %esi +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4) +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: mul_16xi8: +; X86-AVX2: # %bb.0: # %entry +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX2-NEXT: movl c, %esi +; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0 +; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 +; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) +; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: mul_16xi8: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 +; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1 +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; X64-SSE-NEXT: movdqa %xmm1, %xmm4 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X64-SSE-NEXT: pmullw %xmm3, %xmm4 +; X64-SSE-NEXT: movdqa %xmm4, %xmm3 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, 32(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm4, 16(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm3, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: mul_16xi8: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0 +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1 +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4) +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: mul_16xi8: +; X64-AVX2: # %bb.0: # %entry +; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0 +; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X64-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 +; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) +; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <16 x i8>* + %wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1 + %tmp8 = zext <16 x i8> %wide.load to <16 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <16 x i8>* + %wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1 + %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32> + %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <16 x i32>* + store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <2 x i16> +; %op1 = zext<2 x i32> %val1 +; %val2 = load <2 x i16> +; %op2 = zext<2 x i32> %val2 +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_2xi16: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2 +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi16: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2 +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i16>* + %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 + %tmp8 = zext <2 x i16> %wide.load to <2 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <2 x i16>* + %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1 + %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <4 x i16> +; %op1 = zext<4 x i32> %val1 +; %val2 = load <4 x i16> +; %op2 = zext<4 x i32> %val2 +; %rst = mul <4 x i32> %op1, %op2 +; +define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_4xi16: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2 +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_4xi16: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_4xi16: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2 +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_4xi16: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <4 x i16>* + %wide.load = load <4 x i16>, <4 x i16>* %tmp7, align 1 + %tmp8 = zext <4 x i16> %wide.load to <4 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <4 x i16>* + %wide.load17 = load <4 x i16>, <4 x i16>* %tmp11, align 1 + %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32> + %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <4 x i32>* + store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <8 x i16> +; %op1 = zext<8 x i32> %val1 +; %val2 = load <8 x i16> +; %op2 = zext<8 x i32> %val2 +; %rst = mul <8 x i32> %op1, %op2 +; +define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_8xi16: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 +; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1 +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2 +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: mul_8xi16: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: movl c, %esi +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 +; X86-AVX1-NEXT: vmovdqu %xmm0, 16(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm1, (%esi,%ecx,4) +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: mul_8xi16: +; X86-AVX2: # %bb.0: # %entry +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX2-NEXT: movl c, %esi +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: mul_8xi16: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 +; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1 +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2 +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: mul_8xi16: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 +; X64-AVX1-NEXT: vmovdqu %xmm0, 16(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm1, (%rax,%rdx,4) +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: mul_8xi16: +; X64-AVX2: # %bb.0: # %entry +; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <8 x i16>* + %wide.load = load <8 x i16>, <8 x i16>* %tmp7, align 1 + %tmp8 = zext <8 x i16> %wide.load to <8 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <8 x i16>* + %wide.load17 = load <8 x i16>, <8 x i16>* %tmp11, align 1 + %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32> + %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <8 x i32>* + store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <16 x i16> +; %op1 = zext<16 x i32> %val1 +; %val2 = load <16 x i16> +; %op2 = zext<16 x i32> %val2 +; %rst = mul <16 x i32> %op1, %op2 +; +define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_16xi16: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 +; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1 +; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2 +; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3 +; X86-SSE-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE-NEXT: pmulhuw %xmm0, %xmm4 +; X86-SSE-NEXT: pmullw %xmm0, %xmm2 +; X86-SSE-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; X86-SSE-NEXT: movdqa %xmm3, %xmm4 +; X86-SSE-NEXT: pmulhuw %xmm1, %xmm4 +; X86-SSE-NEXT: pmullw %xmm1, %xmm3 +; X86-SSE-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: mul_16xi16: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: movl c, %esi +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4) +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: mul_16xi16: +; X86-AVX2: # %bb.0: # %entry +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX2-NEXT: movl c, %esi +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) +; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: mul_16xi16: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 +; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1 +; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2 +; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3 +; X64-SSE-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE-NEXT: pmulhuw %xmm0, %xmm4 +; X64-SSE-NEXT: pmullw %xmm0, %xmm2 +; X64-SSE-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; X64-SSE-NEXT: movdqa %xmm3, %xmm4 +; X64-SSE-NEXT: pmulhuw %xmm1, %xmm4 +; X64-SSE-NEXT: pmullw %xmm1, %xmm3 +; X64-SSE-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: mul_16xi16: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4) +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: mul_16xi16: +; X64-AVX2: # %bb.0: # %entry +; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) +; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <16 x i16>* + %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1 + %tmp8 = zext <16 x i16> %wide.load to <16 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <16 x i16>* + %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1 + %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32> + %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <16 x i32>* + store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <2 x i8> +; %op1 = sext<2 x i32> %val1 +; %val2 = load <2 x i8> +; %op2 = sext<2 x i32> %val2 +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_2xi8_sext: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax +; X86-SSE-NEXT: movd %eax, %xmm1 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm1 +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: psrad $16, %xmm0 +; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi8_sext: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax +; X86-AVX-NEXT: vmovd %eax, %xmm1 +; X86-AVX-NEXT: vpmovsxbd %xmm1, %xmm1 +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_sext: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm1 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm1 +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: psrad $16, %xmm0 +; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_sext: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx +; X64-AVX-NEXT: vmovd %ecx, %xmm0 +; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx +; X64-AVX-NEXT: vmovd %ecx, %xmm1 +; X64-AVX-NEXT: vpmovsxbd %xmm1, %xmm1 +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i8>* + %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 + %tmp8 = sext <2 x i8> %wide.load to <2 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <2 x i8>* + %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1 + %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <2 x i8> +; %op1 = sext<2 x i32> %val1 +; %val2 = load <2 x i8> +; %op2 = zext<2 x i32> %val2 +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_2xi8_sext_zext: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax +; X86-SSE-NEXT: movd %eax, %xmm1 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm0, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi8_sext_zext: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax +; X86-AVX-NEXT: vmovd %eax, %xmm1 +; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_sext_zext: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm1 +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm0, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_sext_zext: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx +; X64-AVX-NEXT: vmovd %ecx, %xmm0 +; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx +; X64-AVX-NEXT: vmovd %ecx, %xmm1 +; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i8>* + %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 + %tmp8 = sext <2 x i8> %wide.load to <2 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <2 x i8>* + %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1 + %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <2 x i16> +; %op1 = sext<2 x i32> %val1 +; %val2 = load <2 x i16> +; %op2 = sext<2 x i32> %val2 +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_2xi16_sext: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm0, %xmm2 +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi16_sext: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovsxwd %xmm1, %xmm1 +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_sext: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm0, %xmm2 +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_sext: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovsxwd %xmm1, %xmm1 +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i16>* + %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 + %tmp8 = sext <2 x i16> %wide.load to <2 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <2 x i16>* + %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1 + %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <2 x i16> +; %op1 = sext<2 x i32> %val1 +; %val2 = load <2 x i16> +; %op2 = zext<2 x i32> %val2 +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_2xi16_sext_zext: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X86-SSE-NEXT: psrad $16, %xmm0 +; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm0 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi16_sext_zext: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_sext_zext: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X64-SSE-NEXT: psrad $16, %xmm0 +; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm2, %xmm0 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_sext_zext: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i16>* + %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 + %tmp8 = sext <2 x i16> %wide.load to <2 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <2 x i16>* + %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1 + %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <16 x i16> +; %op1 = sext<16 x i32> %val1 +; %val2 = load <16 x i16> +; %op2 = sext<16 x i32> %val2 +; %rst = mul <16 x i32> %op1, %op2 +; +define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_16xi16_sext: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 +; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1 +; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2 +; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3 +; X86-SSE-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE-NEXT: pmulhw %xmm0, %xmm4 +; X86-SSE-NEXT: pmullw %xmm0, %xmm2 +; X86-SSE-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; X86-SSE-NEXT: movdqa %xmm3, %xmm4 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm4 +; X86-SSE-NEXT: pmullw %xmm1, %xmm3 +; X86-SSE-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: mul_16xi16_sext: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: movl c, %esi +; X86-AVX1-NEXT: vpmovsxwd 24(%edx,%ecx), %xmm0 +; X86-AVX1-NEXT: vpmovsxwd 16(%edx,%ecx), %xmm1 +; X86-AVX1-NEXT: vpmovsxwd 8(%edx,%ecx), %xmm2 +; X86-AVX1-NEXT: vpmovsxwd (%edx,%ecx), %xmm3 +; X86-AVX1-NEXT: vpmovsxwd 24(%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpmovsxwd 16(%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpmovsxwd 8(%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpmovsxwd (%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4) +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: mul_16xi16_sext: +; X86-AVX2: # %bb.0: # %entry +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX2-NEXT: movl c, %esi +; X86-AVX2-NEXT: vpmovsxwd 16(%edx,%ecx), %ymm0 +; X86-AVX2-NEXT: vpmovsxwd (%edx,%ecx), %ymm1 +; X86-AVX2-NEXT: vpmovsxwd 16(%eax,%ecx), %ymm2 +; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 +; X86-AVX2-NEXT: vpmovsxwd (%eax,%ecx), %ymm2 +; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) +; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: mul_16xi16_sext: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 +; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1 +; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2 +; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3 +; X64-SSE-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE-NEXT: pmulhw %xmm0, %xmm4 +; X64-SSE-NEXT: pmullw %xmm0, %xmm2 +; X64-SSE-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; X64-SSE-NEXT: movdqa %xmm3, %xmm4 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm4 +; X64-SSE-NEXT: pmullw %xmm1, %xmm3 +; X64-SSE-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: mul_16xi16_sext: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX1-NEXT: vpmovsxwd 24(%rdi,%rdx), %xmm0 +; X64-AVX1-NEXT: vpmovsxwd 16(%rdi,%rdx), %xmm1 +; X64-AVX1-NEXT: vpmovsxwd 8(%rdi,%rdx), %xmm2 +; X64-AVX1-NEXT: vpmovsxwd (%rdi,%rdx), %xmm3 +; X64-AVX1-NEXT: vpmovsxwd 24(%rsi,%rdx), %xmm4 +; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X64-AVX1-NEXT: vpmovsxwd 16(%rsi,%rdx), %xmm4 +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 +; X64-AVX1-NEXT: vpmovsxwd 8(%rsi,%rdx), %xmm4 +; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vpmovsxwd (%rsi,%rdx), %xmm4 +; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4) +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: mul_16xi16_sext: +; X64-AVX2: # %bb.0: # %entry +; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX2-NEXT: vpmovsxwd 16(%rdi,%rdx), %ymm0 +; X64-AVX2-NEXT: vpmovsxwd (%rdi,%rdx), %ymm1 +; X64-AVX2-NEXT: vpmovsxwd 16(%rsi,%rdx), %ymm2 +; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 +; X64-AVX2-NEXT: vpmovsxwd (%rsi,%rdx), %ymm2 +; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) +; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <16 x i16>* + %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1 + %tmp8 = sext <16 x i16> %wide.load to <16 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <16 x i16>* + %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1 + %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32> + %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <16 x i32>* + store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4 + ret void +} + +; %val = load <2 x i8> +; %op1 = zext<2 x i32> %val +; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 255) +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) { +; X86-SSE-LABEL: mul_2xi8_varconst1: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0 +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi8_varconst1: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst1: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm0 +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst1: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-AVX-NEXT: vmovd %ecx, %xmm0 +; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X64-AVX-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i8>* + %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 + %tmp8 = zext <2 x i8> %wide.load to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255> + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val = load <2 x i8> +; %op1 = sext<2 x i32> %val +; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 127) +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) { +; X86-SSE-LABEL: mul_2xi8_varconst2: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X86-SSE-NEXT: psrad $16, %xmm0 +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi8_varconst2: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst2: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X64-SSE-NEXT: psrad $16, %xmm0 +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst2: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-AVX-NEXT: vmovd %ecx, %xmm0 +; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i8>* + %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 + %tmp8 = sext <2 x i8> %wide.load to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127> + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val = load <2 x i8> +; %op1 = zext<2 x i32> %val +; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 256) +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) { +; X86-SSE-LABEL: mul_2xi8_varconst3: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0 +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi8_varconst3: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst3: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm0 +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst3: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-AVX-NEXT: vmovd %ecx, %xmm0 +; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X64-AVX-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i8>* + %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 + %tmp8 = zext <2 x i8> %wide.load to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256> + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val = load <2 x i8> +; %op1 = zext<2 x i32> %val +; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-1 ~ 255) +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) { +; X86-SSE-LABEL: mul_2xi8_varconst4: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi8_varconst4: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst4: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst4: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-AVX-NEXT: vmovd %ecx, %xmm0 +; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i8>* + %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 + %tmp8 = zext <2 x i8> %wide.load to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -1, i32 255> + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val = load <2 x i8> +; %op1 = sext<2 x i32> %val +; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-129 ~ 127) +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) { +; X86-SSE-LABEL: mul_2xi8_varconst5: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi8_varconst5: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst5: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst5: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-AVX-NEXT: vmovd %ecx, %xmm0 +; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i8>* + %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 + %tmp8 = sext <2 x i8> %wide.load to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -129, i32 127> + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val = load <2 x i8> +; %op1 = sext<2 x i32> %val +; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 128) +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) { +; X86-SSE-LABEL: mul_2xi8_varconst6: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi8_varconst6: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst6: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst6: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-AVX-NEXT: vmovd %ecx, %xmm0 +; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i8>* + %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 + %tmp8 = sext <2 x i8> %wide.load to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 128> + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val = load <2 x i16> +; %op1 = zext<2 x i32> %val +; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65535) +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) { +; X86-SSE-LABEL: mul_2xi16_varconst1: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi16_varconst1: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_varconst1: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhuw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_varconst1: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i16>* + %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 + %tmp8 = zext <2 x i16> %wide.load to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65535> + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val = load <2 x i16> +; %op1 = sext<2 x i32> %val +; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-32768 ~ 32767) +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) { +; X86-SSE-LABEL: mul_2xi16_varconst2: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi16_varconst2: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_varconst2: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_varconst2: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i16>* + %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 + %tmp8 = sext <2 x i16> %wide.load to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -32768, i32 32767> + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val = load <2 x i16> +; %op1 = zext<2 x i32> %val +; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65536) +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) { +; X86-SSE-LABEL: mul_2xi16_varconst3: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65536,u,u> +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi16_varconst3: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_varconst3: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65536,u,u> +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_varconst3: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i16>* + %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 + %tmp8 = zext <2 x i16> %wide.load to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65536> + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val = load <2 x i16> +; %op1 = sext<2 x i32> %val +; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 32768) +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) { +; X86-SSE-LABEL: mul_2xi16_varconst4: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X86-SSE-NEXT: psrad $16, %xmm0 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,32768,u,u> +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi16_varconst4: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_varconst4: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X64-SSE-NEXT: psrad $16, %xmm0 +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,32768,u,u> +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_varconst4: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i16>* + %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 + %tmp8 = sext <2 x i16> %wide.load to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 32768> + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; +; Illegal Types +; + +define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { +; X86-SSE-LABEL: PR34947: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movdqa (%eax), %xmm5 +; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movdqa (%ecx), %xmm2 +; X86-SSE-NEXT: movdqa 16(%ecx), %xmm6 +; X86-SSE-NEXT: pxor %xmm0, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X86-SSE-NEXT: movdqa %xmm5, %xmm4 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3] +; X86-SSE-NEXT: movd %xmm0, %eax +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,1,2,3] +; X86-SSE-NEXT: movd %xmm0, %esi +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %esi +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,0,1] +; X86-SSE-NEXT: movd %xmm3, %eax +; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,0,1] +; X86-SSE-NEXT: movd %xmm3, %esi +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %esi +; X86-SSE-NEXT: movd %edx, %xmm7 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; X86-SSE-NEXT: movd %xmm5, %eax +; X86-SSE-NEXT: movd %xmm6, %esi +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %esi +; X86-SSE-NEXT: movd %edx, %xmm3 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] +; X86-SSE-NEXT: movd %xmm5, %eax +; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3] +; X86-SSE-NEXT: movd %xmm5, %esi +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %esi +; X86-SSE-NEXT: movd %edx, %xmm5 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[3,1,2,3] +; X86-SSE-NEXT: movd %xmm6, %eax +; X86-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[3,1,2,3] +; X86-SSE-NEXT: movd %xmm6, %esi +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %esi +; X86-SSE-NEXT: movd %edx, %xmm6 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,0,1] +; X86-SSE-NEXT: movd %xmm7, %eax +; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1] +; X86-SSE-NEXT: movd %xmm7, %esi +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %esi +; X86-SSE-NEXT: movd %edx, %xmm7 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; X86-SSE-NEXT: movd %xmm4, %eax +; X86-SSE-NEXT: movd %xmm2, %esi +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %esi +; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] +; X86-SSE-NEXT: movd %xmm4, %eax +; X86-SSE-NEXT: movd %edx, %xmm4 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; X86-SSE-NEXT: movd %xmm2, %esi +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %esi +; X86-SSE-NEXT: movd %edx, %xmm2 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0] +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm6[0,0] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [8199,8199,8199,8199] +; X86-SSE-NEXT: pmuludq %xmm1, %xmm4 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; X86-SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm0[0,0] +; X86-SSE-NEXT: pmuludq %xmm1, %xmm3 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm5 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl 32(%ecx) +; X86-SSE-NEXT: movdqa %xmm0, (%eax) +; X86-SSE-NEXT: movdqa %xmm4, (%eax) +; X86-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007 +; X86-SSE-NEXT: movl %eax, (%eax) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: PR34947: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: pushl %ebp +; X86-AVX1-NEXT: pushl %ebx +; X86-AVX1-NEXT: pushl %edi +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: subl $16, %esp +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vmovd %xmm1, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl 32(%ecx) +; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-AVX1-NEXT: vpextrd $3, %xmm2, %eax +; X86-AVX1-NEXT: vmovdqa (%ecx), %xmm1 +; X86-AVX1-NEXT: vmovdqa 16(%ecx), %xmm3 +; X86-AVX1-NEXT: vpextrd $3, %xmm3, %ecx +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-AVX1-NEXT: vpextrd $2, %xmm2, %eax +; X86-AVX1-NEXT: vpextrd $2, %xmm3, %ecx +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-AVX1-NEXT: vpextrd $1, %xmm2, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm3, %ecx +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-AVX1-NEXT: vmovd %xmm2, %eax +; X86-AVX1-NEXT: vmovd %xmm3, %ecx +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: movl %edx, %ebp +; X86-AVX1-NEXT: vpextrd $3, %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $3, %xmm1, %ecx +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: movl %edx, %ebx +; X86-AVX1-NEXT: vpextrd $2, %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $2, %xmm1, %esi +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl %esi +; X86-AVX1-NEXT: movl %edx, %esi +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm1, %edi +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl %edi +; X86-AVX1-NEXT: movl %edx, %edi +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vmovd %xmm1, %ecx +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: vmovd %edx, %xmm0 +; X86-AVX1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpinsrd $3, %ebx, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %ebp, %xmm1 +; X86-AVX1-NEXT: vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload +; X86-AVX1-NEXT: vpinsrd $2, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; X86-AVX1-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; X86-AVX1-NEXT: imull $8199, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-AVX1-NEXT: # imm = 0x2007 +; X86-AVX1-NEXT: movl %eax, (%eax) +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] +; X86-AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; X86-AVX1-NEXT: vmovdqa %xmm1, (%eax) +; X86-AVX1-NEXT: vmovdqa %xmm0, (%eax) +; X86-AVX1-NEXT: addl $16, %esp +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: popl %edi +; X86-AVX1-NEXT: popl %ebx +; X86-AVX1-NEXT: popl %ebp +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: PR34947: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: pushl %edi +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vmovdqa (%esi), %xmm2 +; X86-AVX2-NEXT: vmovdqa 16(%esi), %xmm3 +; X86-AVX2-NEXT: vpextrd $1, %xmm3, %ecx +; X86-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 +; X86-AVX2-NEXT: vpextrd $1, %xmm4, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: movl %edx, %ecx +; X86-AVX2-NEXT: vmovd %xmm3, %edi +; X86-AVX2-NEXT: vmovd %xmm4, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %edi +; X86-AVX2-NEXT: vmovd %edx, %xmm5 +; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm5, %xmm5 +; X86-AVX2-NEXT: vpextrd $2, %xmm3, %ecx +; X86-AVX2-NEXT: vpextrd $2, %xmm4, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 +; X86-AVX2-NEXT: vpextrd $3, %xmm3, %ecx +; X86-AVX2-NEXT: vpextrd $3, %xmm4, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3 +; X86-AVX2-NEXT: vpextrd $1, %xmm2, %ecx +; X86-AVX2-NEXT: vpextrd $1, %xmm1, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: movl %edx, %ecx +; X86-AVX2-NEXT: vmovd %xmm2, %edi +; X86-AVX2-NEXT: vmovd %xmm1, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %edi +; X86-AVX2-NEXT: vmovd %edx, %xmm4 +; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm4, %xmm4 +; X86-AVX2-NEXT: vpextrd $2, %xmm2, %ecx +; X86-AVX2-NEXT: vpextrd $2, %xmm1, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 +; X86-AVX2-NEXT: vpextrd $3, %xmm2, %ecx +; X86-AVX2-NEXT: vpextrd $3, %xmm1, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1 +; X86-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl 32(%esi) +; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199] +; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: imull $8199, %edx, %eax # imm = 0x2007 +; X86-AVX2-NEXT: movl %eax, (%eax) +; X86-AVX2-NEXT: vmovdqa %ymm0, (%eax) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: popl %edi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: PR34947: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movdqa (%rdi), %xmm5 +; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movdqa (%rsi), %xmm2 +; X64-SSE-NEXT: movdqa 16(%rsi), %xmm6 +; X64-SSE-NEXT: pxor %xmm0, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-SSE-NEXT: movdqa %xmm5, %xmm3 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3] +; X64-SSE-NEXT: movd %xmm0, %eax +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,1,2,3] +; X64-SSE-NEXT: movd %xmm0, %ecx +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm8 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1] +; X64-SSE-NEXT: movd %xmm4, %eax +; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,0,1] +; X64-SSE-NEXT: movd %xmm4, %ecx +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm7 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; X64-SSE-NEXT: movd %xmm5, %eax +; X64-SSE-NEXT: movd %xmm6, %ecx +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm4 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] +; X64-SSE-NEXT: movd %xmm5, %eax +; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3] +; X64-SSE-NEXT: movd %xmm5, %ecx +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm5 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[3,1,2,3] +; X64-SSE-NEXT: movd %xmm6, %eax +; X64-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[3,1,2,3] +; X64-SSE-NEXT: movd %xmm6, %ecx +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm6 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,0,1] +; X64-SSE-NEXT: movd %xmm7, %eax +; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1] +; X64-SSE-NEXT: movd %xmm7, %ecx +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm7 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; X64-SSE-NEXT: movd %xmm3, %eax +; X64-SSE-NEXT: movd %xmm2, %ecx +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; X64-SSE-NEXT: movd %xmm3, %eax +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; X64-SSE-NEXT: movd %xmm2, %ecx +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm2 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; X64-SSE-NEXT: movd %xmm1, %eax +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl 32(%rsi) +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [8199,8199,8199,8199] +; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm6[0,0] +; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-SSE-NEXT: pmuludq %xmm1, %xmm4 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] +; X64-SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm8[0,0] +; X64-SSE-NEXT: pmuludq %xmm1, %xmm5 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007 +; X64-SSE-NEXT: movl %eax, (%rax) +; X64-SSE-NEXT: movdqa %xmm2, (%rax) +; X64-SSE-NEXT: movdqa %xmm0, (%rax) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: PR34947: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: pushq %rbp +; X64-AVX1-NEXT: pushq %rbx +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vmovd %xmm1, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl 32(%rsi) +; X64-AVX1-NEXT: movl %edx, %r8d +; X64-AVX1-NEXT: vpextrd $3, %xmm2, %eax +; X64-AVX1-NEXT: vmovdqa (%rsi), %xmm1 +; X64-AVX1-NEXT: vmovdqa 16(%rsi), %xmm3 +; X64-AVX1-NEXT: vpextrd $3, %xmm3, %ecx +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %r9d +; X64-AVX1-NEXT: vpextrd $2, %xmm2, %eax +; X64-AVX1-NEXT: vpextrd $2, %xmm3, %ecx +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %r10d +; X64-AVX1-NEXT: vpextrd $1, %xmm2, %eax +; X64-AVX1-NEXT: vpextrd $1, %xmm3, %ecx +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %r11d +; X64-AVX1-NEXT: vmovd %xmm2, %eax +; X64-AVX1-NEXT: vmovd %xmm3, %ecx +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %esi +; X64-AVX1-NEXT: vpextrd $3, %xmm0, %eax +; X64-AVX1-NEXT: vpextrd $3, %xmm1, %ecx +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %edi +; X64-AVX1-NEXT: vpextrd $2, %xmm0, %eax +; X64-AVX1-NEXT: vpextrd $2, %xmm1, %ecx +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %ecx +; X64-AVX1-NEXT: vpextrd $1, %xmm0, %eax +; X64-AVX1-NEXT: vpextrd $1, %xmm1, %ebx +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ebx +; X64-AVX1-NEXT: movl %edx, %ebx +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: vmovd %xmm1, %ebp +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ebp +; X64-AVX1-NEXT: vmovd %edx, %xmm0 +; X64-AVX1-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199] +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %esi, %xmm2 +; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 +; X64-AVX1-NEXT: imull $8199, %r8d, %eax # imm = 0x2007 +; X64-AVX1-NEXT: movl %eax, (%rax) +; X64-AVX1-NEXT: vmovdqa %xmm1, (%rax) +; X64-AVX1-NEXT: vmovdqa %xmm0, (%rax) +; X64-AVX1-NEXT: popq %rbx +; X64-AVX1-NEXT: popq %rbp +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: PR34947: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vmovdqa (%rsi), %xmm2 +; X64-AVX2-NEXT: vmovdqa 16(%rsi), %xmm3 +; X64-AVX2-NEXT: vpextrd $1, %xmm3, %ecx +; X64-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 +; X64-AVX2-NEXT: vpextrd $1, %xmm4, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: movl %edx, %ecx +; X64-AVX2-NEXT: vmovd %xmm3, %edi +; X64-AVX2-NEXT: vmovd %xmm4, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %edi +; X64-AVX2-NEXT: vmovd %edx, %xmm5 +; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm5, %xmm5 +; X64-AVX2-NEXT: vpextrd $2, %xmm3, %ecx +; X64-AVX2-NEXT: vpextrd $2, %xmm4, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 +; X64-AVX2-NEXT: vpextrd $3, %xmm3, %ecx +; X64-AVX2-NEXT: vpextrd $3, %xmm4, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3 +; X64-AVX2-NEXT: vpextrd $1, %xmm2, %ecx +; X64-AVX2-NEXT: vpextrd $1, %xmm1, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: movl %edx, %ecx +; X64-AVX2-NEXT: vmovd %xmm2, %edi +; X64-AVX2-NEXT: vmovd %xmm1, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %edi +; X64-AVX2-NEXT: vmovd %edx, %xmm4 +; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm4, %xmm4 +; X64-AVX2-NEXT: vpextrd $2, %xmm2, %ecx +; X64-AVX2-NEXT: vpextrd $2, %xmm1, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 +; X64-AVX2-NEXT: vpextrd $3, %xmm2, %ecx +; X64-AVX2-NEXT: vpextrd $3, %xmm1, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1 +; X64-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl 32(%rsi) +; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199] +; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: imull $8199, %edx, %eax # imm = 0x2007 +; X64-AVX2-NEXT: movl %eax, (%rax) +; X64-AVX2-NEXT: vmovdqa %ymm0, (%rax) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %a0 = load <9 x i16>, <9 x i16>* %p0, align 64 + %a1 = load <9 x i32>, <9 x i32>* %p1, align 64 + %ext0 = zext <9 x i16> %a0 to <9 x i32> + %rem = urem <9 x i32> %ext0, %a1 + %mul = mul <9 x i32> <i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199>, %rem + store <9 x i32> %mul, <9 x i32>* undef, align 64 + ret void +} diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128-widen.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128-widen.ll new file mode 100644 index 00000000000..fae69ea5d80 --- /dev/null +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128-widen.ll @@ -0,0 +1,574 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL + +; PR31551 +; Pairs of shufflevector:trunc functions with functional equivalence. +; Ideally, the shuffles should be lowered to code with the same quality as the truncates. + +define void @shuffle_v16i8_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind { +; SSE2-LABEL: shuffle_v16i8_to_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movq %xmm0, (%rsi) +; SSE2-NEXT: retq +; +; SSE42-LABEL: shuffle_v16i8_to_v8i8: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSE42-NEXT: movq %xmm0, (%rsi) +; SSE42-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_to_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512-LABEL: shuffle_v16i8_to_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm0, (%rsi) +; AVX512-NEXT: retq + %vec = load <16 x i8>, <16 x i8>* %L + %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + store <8 x i8> %strided.vec, <8 x i8>* %S + ret void +} + +define void @trunc_v8i16_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind { +; SSE2-LABEL: trunc_v8i16_to_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movq %xmm0, (%rsi) +; SSE2-NEXT: retq +; +; SSE42-LABEL: trunc_v8i16_to_v8i8: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSE42-NEXT: movq %xmm0, (%rsi) +; SSE42-NEXT: retq +; +; AVX-LABEL: trunc_v8i16_to_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v8i16_to_v8i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v8i16_to_v8i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v8i16_to_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v8i16_to_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i8>, <16 x i8>* %L + %bc = bitcast <16 x i8> %vec to <8 x i16> + %strided.vec = trunc <8 x i16> %bc to <8 x i8> + store <8 x i8> %strided.vec, <8 x i8>* %S + ret void +} + +define void @shuffle_v8i16_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind { +; SSE2-LABEL: shuffle_v8i16_to_v4i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: movq %xmm0, (%rsi) +; SSE2-NEXT: retq +; +; SSE42-LABEL: shuffle_v8i16_to_v4i16: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE42-NEXT: movq %xmm0, (%rsi) +; SSE42-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_to_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512-LABEL: shuffle_v8i16_to_v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512-NEXT: vmovq %xmm0, (%rsi) +; AVX512-NEXT: retq + %vec = load <8 x i16>, <8 x i16>* %L + %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + store <4 x i16> %strided.vec, <4 x i16>* %S + ret void +} + +define void @trunc_v4i32_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind { +; SSE2-LABEL: trunc_v4i32_to_v4i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: movq %xmm0, (%rsi) +; SSE2-NEXT: retq +; +; SSE42-LABEL: trunc_v4i32_to_v4i16: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE42-NEXT: movq %xmm0, (%rsi) +; SSE42-NEXT: retq +; +; AVX-LABEL: trunc_v4i32_to_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v4i32_to_v4i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i32_to_v4i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i32_to_v4i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v4i32_to_v4i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <8 x i16>, <8 x i16>* %L + %bc = bitcast <8 x i16> %vec to <4 x i32> + %strided.vec = trunc <4 x i32> %bc to <4 x i16> + store <4 x i16> %strided.vec, <4 x i16>* %S + ret void +} + +define void @shuffle_v4i32_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind { +; SSE-LABEL: shuffle_v4i32_to_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; SSE-NEXT: movq %xmm0, (%rsi) +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_to_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX-NEXT: vmovlps %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512-LABEL: shuffle_v4i32_to_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512-NEXT: vmovlps %xmm0, (%rsi) +; AVX512-NEXT: retq + %vec = load <4 x i32>, <4 x i32>* %L + %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> <i32 0, i32 2> + store <2 x i32> %strided.vec, <2 x i32>* %S + ret void +} + +define void @trunc_v2i64_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind { +; SSE-LABEL: trunc_v2i64_to_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; SSE-NEXT: movq %xmm0, (%rsi) +; SSE-NEXT: retq +; +; AVX-LABEL: trunc_v2i64_to_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX-NEXT: vmovlps %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v2i64_to_v2i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512F-NEXT: vmovlps %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v2i64_to_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovqd %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v2i64_to_v2i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512BW-NEXT: vmovlps %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v2i64_to_v2i32: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovqd %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <4 x i32>, <4 x i32>* %L + %bc = bitcast <4 x i32> %vec to <2 x i64> + %strided.vec = trunc <2 x i64> %bc to <2 x i32> + store <2 x i32> %strided.vec, <2 x i32>* %S + ret void +} + +define void @shuffle_v16i8_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind { +; SSE2-LABEL: shuffle_v16i8_to_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movd %xmm0, (%rsi) +; SSE2-NEXT: retq +; +; SSE42-LABEL: shuffle_v16i8_to_v4i8: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE42-NEXT: movd %xmm0, (%rsi) +; SSE42-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_to_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512-LABEL: shuffle_v16i8_to_v4i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovd %xmm0, (%rsi) +; AVX512-NEXT: retq + %vec = load <16 x i8>, <16 x i8>* %L + %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> + store <4 x i8> %strided.vec, <4 x i8>* %S + ret void +} + +define void @trunc_v4i32_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind { +; SSE2-LABEL: trunc_v4i32_to_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movd %xmm0, (%rsi) +; SSE2-NEXT: retq +; +; SSE42-LABEL: trunc_v4i32_to_v4i8: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE42-NEXT: movd %xmm0, (%rsi) +; SSE42-NEXT: retq +; +; AVX-LABEL: trunc_v4i32_to_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v4i32_to_v4i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i32_to_v4i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i32_to_v4i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v4i32_to_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i8>, <16 x i8>* %L + %bc = bitcast <16 x i8> %vec to <4 x i32> + %strided.vec = trunc <4 x i32> %bc to <4 x i8> + store <4 x i8> %strided.vec, <4 x i8>* %S + ret void +} + +define void @shuffle_v8i16_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind { +; SSE-LABEL: shuffle_v8i16_to_v2i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: movd %xmm0, (%rsi) +; SSE-NEXT: retq +; +; AVX1-LABEL: shuffle_v8i16_to_v2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vmovd %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi) +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi) +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: shuffle_v8i16_to_v2i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i16_to_v2i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vmovd %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v8i16_to_v2i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <8 x i16>, <8 x i16>* %L + %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 0, i32 4> + store <2 x i16> %strided.vec, <2 x i16>* %S + ret void +} + +define void @trunc_v2i64_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind { +; SSE-LABEL: trunc_v2i64_to_v2i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: movd %xmm0, (%rsi) +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_v2i64_to_v2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vmovd %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_v2i64_to_v2i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi) +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_v2i64_to_v2i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi) +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc_v2i64_to_v2i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v2i64_to_v2i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v2i64_to_v2i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v2i64_to_v2i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <8 x i16>, <8 x i16>* %L + %bc = bitcast <8 x i16> %vec to <2 x i64> + %strided.vec = trunc <2 x i64> %bc to <2 x i16> + store <2 x i16> %strided.vec, <2 x i16>* %S + ret void +} + +define void @shuffle_v16i8_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind { +; SSE2-LABEL: shuffle_v16i8_to_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movw %ax, (%rsi) +; SSE2-NEXT: retq +; +; SSE42-LABEL: shuffle_v16i8_to_v2i8: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE42-NEXT: pextrw $0, %xmm0, (%rsi) +; SSE42-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_to_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512-LABEL: shuffle_v16i8_to_v2i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512-NEXT: retq + %vec = load <16 x i8>, <16 x i8>* %L + %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 0, i32 8> + store <2 x i8> %strided.vec, <2 x i8>* %S + ret void +} + +define void @trunc_v2i64_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind { +; SSE2-LABEL: trunc_v2i64_to_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movw %ax, (%rsi) +; SSE2-NEXT: retq +; +; SSE42-LABEL: trunc_v2i64_to_v2i8: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE42-NEXT: pextrw $0, %xmm0, (%rsi) +; SSE42-NEXT: retq +; +; AVX-LABEL: trunc_v2i64_to_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v2i64_to_v2i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v2i64_to_v2i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v2i64_to_v2i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v2i64_to_v2i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i8>, <16 x i8>* %L + %bc = bitcast <16 x i8> %vec to <2 x i64> + %strided.vec = trunc <2 x i64> %bc to <2 x i8> + store <2 x i8> %strided.vec, <2 x i8>* %S + ret void +} diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll new file mode 100644 index 00000000000..ef5b8665e16 --- /dev/null +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll @@ -0,0 +1,1454 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL + +; PR31551 +; Pairs of shufflevector:trunc functions with functional equivalence. +; Ideally, the shuffles should be lowered to code with the same quality as the truncates. + +define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind { +; AVX-LABEL: shuffle_v32i8_to_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v32i8_to_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i8_to_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v32i8_to_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v16i8: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1 +; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi) +; AVX512VBMIVL-NEXT: retq + %vec = load <32 x i8>, <32 x i8>* %L + %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> + store <16 x i8> %strided.vec, <16 x i8>* %S + ret void +} + +define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind { +; AVX1-LABEL: trunc_v16i16_to_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_v16i16_to_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_v16i16_to_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, (%rsi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v16i16_to_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v16i16_to_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v16i16_to_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: trunc_v16i16_to_v16i8: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VBMIVL-NEXT: vpmovwb %ymm0, (%rsi) +; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: retq + %vec = load <32 x i8>, <32 x i8>* %L + %bc = bitcast <32 x i8> %vec to <16 x i16> + %strided.vec = trunc <16 x i16> %bc to <16 x i8> + store <16 x i8> %strided.vec, <16 x i8>* %S + ret void +} + +define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind { +; AVX-LABEL: shuffle_v16i16_to_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v16i16_to_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i16_to_v8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v16i16_to_v8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14] +; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 +; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi) +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v8i16: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14] +; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 +; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi) +; AVX512VBMIVL-NEXT: retq + %vec = load <16 x i16>, <16 x i16>* %L + %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + store <8 x i16> %strided.vec, <8 x i16>* %S + ret void +} + +define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind { +; AVX1-LABEL: trunc_v8i32_to_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_v8i32_to_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_v8i32_to_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v8i32_to_v8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v8i32_to_v8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v8i32_to_v8i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i16: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VBMIVL-NEXT: vpmovdw %ymm0, (%rsi) +; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: retq + %vec = load <16 x i16>, <16 x i16>* %L + %bc = bitcast <16 x i16> %vec to <8 x i32> + %strided.vec = trunc <8 x i32> %bc to <8 x i16> + store <8 x i16> %strided.vec, <8 x i16>* %S + ret void +} + +define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind { +; AVX-LABEL: shuffle_v8i32_to_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] +; AVX-NEXT: vmovaps %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512-LABEL: shuffle_v8i32_to_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovaps (%rdi), %xmm0 +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] +; AVX512-NEXT: vmovaps %xmm0, (%rsi) +; AVX512-NEXT: retq + %vec = load <8 x i32>, <8 x i32>* %L + %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + store <4 x i32> %strided.vec, <4 x i32>* %S + ret void +} + +define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind { +; AVX1-LABEL: trunc_v4i64_to_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] +; AVX1-NEXT: vmovaps %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] +; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rsi) +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_v4i64_to_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps (%rdi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps %xmm0, (%rsi) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc_v4i64_to_v4i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i64_to_v4i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vpmovqd %ymm0, (%rsi) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i64_to_v4i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v4i64_to_v4i32: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovqd %ymm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i32: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VBMIVL-NEXT: vpmovqd %ymm0, (%rsi) +; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: retq + %vec = load <8 x i32>, <8 x i32>* %L + %bc = bitcast <8 x i32> %vec to <4 x i64> + %strided.vec = trunc <4 x i64> %bc to <4 x i32> + store <4 x i32> %strided.vec, <4 x i32>* %S + ret void +} + +define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind { +; AVX-LABEL: shuffle_v32i8_to_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v32i8_to_v8i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i8_to_v8i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v32i8_to_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2024390091656922112,2024390091656922112] +; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1 +; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi) +; AVX512VBMIVL-NEXT: retq + %vec = load <32 x i8>, <32 x i8>* %L + %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28> + store <8 x i8> %strided.vec, <8 x i8>* %S + ret void +} + +define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind { +; AVX-LABEL: trunc_v8i32_to_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v8i32_to_v8i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v8i32_to_v8i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v8i32_to_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VBMIVL-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: retq + %vec = load <32 x i8>, <32 x i8>* %L + %bc = bitcast <32 x i8> %vec to <8 x i32> + %strided.vec = trunc <8 x i32> %bc to <8 x i8> + store <8 x i8> %strided.vec, <8 x i8>* %S + ret void +} + +define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind { +; IR generated from: +; return (__m128i) {(long long)__builtin_convertvector((__v8si)__A, __v8qi), 0}; +; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v2i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v2i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: retq + %truncated.vec = trunc <8 x i32> %vec to <8 x i8> + %bc = bitcast <8 x i8> %truncated.vec to i64 + %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0 + ret <2 x i64> %result +} + +define <16 x i8> @trunc_v8i32_to_v8i8_with_zext_return_v16i8(<8 x i32> %vec) nounwind { +; AVX1-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: retq + %truncated = trunc <8 x i32> %vec to <8 x i8> + %truncated.ext = zext <8 x i8> %truncated to <8 x i16> + %bc = bitcast <8 x i16> %truncated.ext to <16 x i8> + %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + ret <16 x i8> %result +} + +define <16 x i8> @trunc_v8i32_to_v8i8_via_v8i16_return_v16i8(<8 x i32> %vec) nounwind { +; AVX1-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: retq + %truncated = trunc <8 x i32> %vec to <8 x i16> + %bc = bitcast <8 x i16> %truncated to <16 x i8> + %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 17, i32 20, i32 24, i32 22, i32 31, i32 28, i32 28, i32 29> + ret <16 x i8> %result +} + +define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind { +; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: retq + %truncated = trunc <8 x i32> %vec to <8 x i8> + %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %result +} + +define <2 x i64> @trunc_v4i64_to_v4i16_return_v2i64(<4 x i64> %vec) nounwind { +; IR generated from: +; return (__m128i) {(long long)__builtin_convertvector((__v4di)x, __v4hi), 0}; +; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v2i64: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v2i64: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v2i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v2i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: retq + %truncated = trunc <4 x i64> %vec to <4 x i16> + %bc = bitcast <4 x i16> %truncated to i64 + %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0 + ret <2 x i64> %result +} + +define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) nounwind { +; AVX1-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: retq + %truncated = trunc <4 x i64> %vec to <4 x i16> + %truncated.ext = zext <4 x i16> %truncated to <4 x i32> + %bc = bitcast <4 x i32> %truncated.ext to <8 x i16> + %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7> + ret <8 x i16> %result +} + +define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) nounwind { +; AVX1-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: retq + %truncated = trunc <4 x i64> %vec to <4 x i32> + %bc = bitcast <4 x i32> %truncated to <8 x i16> + %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 8, i32 undef, i32 13> + ret <8 x i16> %result +} + +define <8 x i16> @trunc_v4i64_to_v4i16_return_v8i16(<4 x i64> %vec) nounwind { +; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: retq + %truncated = trunc <4 x i64> %vec to <4 x i16> + %result = shufflevector <4 x i16> %truncated, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x i16> %result +} + +define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind { +; AVX1-LABEL: trunc_v4i64_to_v4i8_return_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_v4i64_to_v4i8_return_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_v4i64_to_v4i8_return_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i64_to_v4i8_return_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0 +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i64_to_v4i8_return_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vpmovqb %ymm0, %xmm0 +; AVX512VBMIVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VBMIVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: retq + %truncated = trunc <4 x i64> %vec to <4 x i8> + %result = shufflevector <4 x i8> %truncated, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 5, i32 5, i32 undef, i32 7> + ret <16 x i8> %result +} + +define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind { +; AVX1-LABEL: shuffle_v16i16_to_v4i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-NEXT: vmovq %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi) +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi) +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: shuffle_v16i16_to_v4i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i16_to_v4i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v16i16_to_v4i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,4,5,12,13] +; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 +; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi) +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,4,5,12,13] +; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 +; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi) +; AVX512VBMIVL-NEXT: retq + %vec = load <16 x i16>, <16 x i16>* %L + %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> + store <4 x i16> %strided.vec, <4 x i16>* %S + ret void +} + +define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind { +; AVX1-LABEL: trunc_v4i64_to_v4i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-NEXT: vmovq %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi) +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi) +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc_v4i64_to_v4i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i64_to_v4i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i64_to_v4i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VBMIVL-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: retq + %vec = load <16 x i16>, <16 x i16>* %L + %bc = bitcast <16 x i16> %vec to <4 x i64> + %strided.vec = trunc <4 x i64> %bc to <4 x i16> + store <4 x i16> %strided.vec, <4 x i16>* %S + ret void +} + +define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind { +; AVX-LABEL: shuffle_v32i8_to_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v32i8_to_v4i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i8_to_v4i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512VL-NEXT: vmovd %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v32i8_to_v4i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VBMIVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [403703808,403703808,403703808,403703808] +; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1 +; AVX512VBMIVL-NEXT: vmovd %xmm1, (%rsi) +; AVX512VBMIVL-NEXT: retq + %vec = load <32 x i8>, <32 x i8>* %L + %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24> + store <4 x i8> %strided.vec, <4 x i8>* %S + ret void +} + +define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind { +; AVX-LABEL: trunc_v4i64_to_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v4i64_to_v4i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i64_to_v4i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i64_to_v4i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VBMIVL-NEXT: vpmovqb %ymm0, (%rsi) +; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: retq + %vec = load <32 x i8>, <32 x i8>* %L + %bc = bitcast <32 x i8> %vec to <4 x i64> + %strided.vec = trunc <4 x i64> %bc to <4 x i8> + store <4 x i8> %strided.vec, <4 x i8>* %S + ret void +} + +; In this case not all elements are collected from the same source vector, so +; the resulting BUILD_VECTOR should not be combined to a truncate. +define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind { +; AVX1-LABEL: negative: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: negative: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: negative: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: negative: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: negative: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: negative: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30] +; AVX512BWVL-NEXT: movl $65537, %eax # imm = 0x10001 +; AVX512BWVL-NEXT: kmovd %eax, %k1 +; AVX512BWVL-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} +; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: negative: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm2 = [32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,48,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30] +; AVX512VBMIVL-NEXT: vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: retq + %strided.vec = shufflevector <32 x i8> %v, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> + %w0 = extractelement <32 x i8> %w, i32 0 + %merged = insertelement <16 x i8> %strided.vec, i8 %w0, i32 0 + ret <16 x i8> %merged +} diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll new file mode 100644 index 00000000000..f9eae50039c --- /dev/null +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll @@ -0,0 +1,903 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL + +; PR31551 +; Pairs of shufflevector:trunc functions with functional equivalence. +; Ideally, the shuffles should be lowered to code with the same quality as the truncates. + +define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind { +; AVX512F-LABEL: shuffle_v64i8_to_v32i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v64i8_to_v32i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7] +; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v64i8_to_v32i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7] +; AVX512BWVL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512VBMI-LABEL: shuffle_v64i8_to_v32i8: +; AVX512VBMI: # %bb.0: +; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VBMI-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX512VBMI-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512VBMI-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512VBMI-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VBMI-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512VBMI-NEXT: vzeroupper +; AVX512VBMI-NEXT: retq +; +; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v32i8: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] +; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1 +; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi) +; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: retq + %vec = load <64 x i8>, <64 x i8>* %L + %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62> + store <32 x i8> %strided.vec, <32 x i8>* %S + ret void +} + +define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind { +; AVX512F-LABEL: trunc_v32i16_to_v32i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovdb %zmm1, 16(%rsi) +; AVX512F-NEXT: vpmovdb %zmm0, (%rsi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v32i16_to_v32i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512VL-NEXT: vpmovdb %zmm1, 16(%rsi) +; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v32i16_to_v32i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512VBMI-LABEL: trunc_v32i16_to_v32i8: +; AVX512VBMI: # %bb.0: +; AVX512VBMI-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512VBMI-NEXT: vpmovwb %zmm0, (%rsi) +; AVX512VBMI-NEXT: vzeroupper +; AVX512VBMI-NEXT: retq +; +; AVX512VBMIVL-LABEL: trunc_v32i16_to_v32i8: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512VBMIVL-NEXT: vpmovwb %zmm0, (%rsi) +; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: retq + %vec = load <64 x i8>, <64 x i8>* %L + %bc = bitcast <64 x i8> %vec to <32 x i16> + %strided.vec = trunc <32 x i16> %bc to <32 x i8> + store <32 x i8> %strided.vec, <32 x i8>* %S + ret void +} + +define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind { +; AVX512F-LABEL: shuffle_v32i16_to_v16i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6] +; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-NEXT: vmovaps %ymm0, (%rsi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i16_to_v16i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14] +; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v32i16_to_v16i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31] +; AVX512BW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512BW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512BW-NEXT: vmovaps %ymm0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512VBMI-LABEL: shuffle_v32i16_to_v16i16: +; AVX512VBMI: # %bb.0: +; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31] +; AVX512VBMI-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512VBMI-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512VBMI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX512VBMI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsi) +; AVX512VBMI-NEXT: vzeroupper +; AVX512VBMI-NEXT: retq +; +; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v16i16: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi) +; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: retq + %vec = load <32 x i16>, <32 x i16>* %L + %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> + store <16 x i16> %strided.vec, <16 x i16>* %S + ret void +} + +define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind { +; AVX512-LABEL: trunc_v16i32_to_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpmovdw %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %vec = load <32 x i16>, <32 x i16>* %L + %bc = bitcast <32 x i16> %vec to <16 x i32> + %strided.vec = trunc <16 x i32> %bc to <16 x i16> + store <16 x i16> %strided.vec, <16 x i16>* %S + ret void +} + +define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind { +; AVX512F-LABEL: shuffle_v16i32_to_v8i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps (%rdi), %ymm0 +; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6] +; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-NEXT: vmovaps %ymm0, (%rsi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i32_to_v8i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14] +; AVX512VL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1 +; AVX512VL-NEXT: vmovdqa %ymm1, (%rsi) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v16i32_to_v8i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovaps (%rdi), %ymm0 +; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6] +; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512BW-NEXT: vmovaps %ymm0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14] +; AVX512BWVL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512VBMI-LABEL: shuffle_v16i32_to_v8i32: +; AVX512VBMI: # %bb.0: +; AVX512VBMI-NEXT: vmovaps (%rdi), %ymm0 +; AVX512VBMI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6] +; AVX512VBMI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsi) +; AVX512VBMI-NEXT: vzeroupper +; AVX512VBMI-NEXT: retq +; +; AVX512VBMIVL-LABEL: shuffle_v16i32_to_v8i32: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14] +; AVX512VBMIVL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1 +; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi) +; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: retq + %vec = load <16 x i32>, <16 x i32>* %L + %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + store <8 x i32> %strided.vec, <8 x i32>* %S + ret void +} + +define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind { +; AVX512-LABEL: trunc_v8i64_to_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %vec = load <16 x i32>, <16 x i32>* %L + %bc = bitcast <16 x i32> %vec to <8 x i64> + %strided.vec = trunc <8 x i64> %bc to <8 x i32> + store <8 x i32> %strided.vec, <8 x i32>* %S + ret void +} + +define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind { +; AVX512F-LABEL: shuffle_v64i8_to_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v64i8_to_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v64i8_to_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq +; +; AVX512VBMI-LABEL: shuffle_v64i8_to_v16i8: +; AVX512VBMI: # %bb.0: +; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> +; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VBMI-NEXT: retq +; +; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v16i8: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512VBMIVL-NEXT: vpermt2b 32(%rdi), %ymm0, %ymm1 +; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi) +; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: retq + %vec = load <64 x i8>, <64 x i8>* %L + %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60> + store <16 x i8> %strided.vec, <16 x i8>* %S + ret void +} + +define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind { +; AVX512-LABEL: trunc_v16i32_to_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %vec = load <64 x i8>, <64 x i8>* %L + %bc = bitcast <64 x i8> %vec to <16 x i32> + %strided.vec = trunc <16 x i32> %bc to <16 x i8> + store <16 x i8> %strided.vec, <16 x i8>* %S + ret void +} + +define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind { +; AVX512F-LABEL: shuffle_v32i16_to_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = mem[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i16_to_v8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v32i16_to_v8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28] +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512VBMI-LABEL: shuffle_v32i16_to_v8i16: +; AVX512VBMI: # %bb.0: +; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15] +; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VBMI-NEXT: retq +; +; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v8i16: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28] +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 +; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi) +; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: retq + %vec = load <32 x i16>, <32 x i16>* %L + %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28> + store <8 x i16> %strided.vec, <8 x i16>* %S + ret void +} + +define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind { +; AVX512-LABEL: trunc_v8i64_to_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %vec = load <32 x i16>, <32 x i16>* %L + %bc = bitcast <32 x i16> %vec to <8 x i64> + %strided.vec = trunc <8 x i64> %bc to <8 x i16> + store <8 x i16> %strided.vec, <8 x i16>* %S + ret void +} + +define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind { +; AVX512F-LABEL: shuffle_v64i8_to_v8i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v64i8_to_v8i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] +; AVX512VL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v64i8_to_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] +; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq +; +; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8: +; AVX512VBMI: # %bb.0: +; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] +; AVX512VBMI-NEXT: vmovq %xmm0, (%rsi) +; AVX512VBMI-NEXT: retq +; +; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4048780183313844224,4048780183313844224,4048780183313844224,4048780183313844224] +; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1 +; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi) +; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: retq + %vec = load <64 x i8>, <64 x i8>* %L + %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56> + store <8 x i8> %strided.vec, <8 x i8>* %S + ret void +} + +define void @trunc_v8i64_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind { +; AVX512-LABEL: trunc_v8i64_to_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %vec = load <64 x i8>, <64 x i8>* %L + %bc = bitcast <64 x i8> %vec to <8 x i64> + %strided.vec = trunc <8 x i64> %bc to <8 x i8> + store <8 x i8> %strided.vec, <8 x i8>* %S + ret void +} + +define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61(<64 x i8> %x) { +; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: +; AVX512VBMI: # %bb.0: +; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] +; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMI-NEXT: vzeroupper +; AVX512VBMI-NEXT: retq +; +; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] +; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VBMIVL-NEXT: vpermt2b %ymm2, %ymm1, %ymm0 +; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: retq + %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61> + ret <16 x i8> %res +} + +define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) { +; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] +; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] +; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: +; AVX512VBMI: # %bb.0: +; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62] +; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMI-NEXT: vzeroupper +; AVX512VBMI-NEXT: retq +; +; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62] +; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VBMIVL-NEXT: vpermt2b %ymm2, %ymm1, %ymm0 +; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: retq + %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 62> + ret <16 x i8> %res +} + +define <4 x double> @PR34175(<32 x i16>* %p) { +; AVX512F-LABEL: PR34175: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512F-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: PR34175: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqu 32(%rdi), %xmm1 +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,2,3] +; AVX512VL-NEXT: vpermi2d %xmm1, %xmm0, %xmm2 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: PR34175: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqu 32(%rdi), %xmm1 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512BW-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: PR34175: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768] +; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX512BWVL-NEXT: retq +; +; AVX512VBMI-LABEL: PR34175: +; AVX512VBMI: # %bb.0: +; AVX512VBMI-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %xmm1 +; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512VBMI-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512VBMI-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX512VBMI-NEXT: retq +; +; AVX512VBMIVL-LABEL: PR34175: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768] +; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512VBMIVL-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX512VBMIVL-NEXT: retq + %v = load <32 x i16>, <32 x i16>* %p, align 2 + %shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24> + %tofp = uitofp <4 x i16> %shuf to <4 x double> + ret <4 x double> %tofp +} + +define <16 x i8> @trunc_v8i64_to_v8i8_return_v16i8(<8 x i64> %vec) nounwind { +; AVX512-LABEL: trunc_v8i64_to_v8i8_return_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %truncated = trunc <8 x i64> %vec to <8 x i8> + %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %result +} + diff --git a/llvm/test/CodeGen/X86/vec_cast2.ll b/llvm/test/CodeGen/X86/vec_cast2.ll index e43216dcdbd..dad00e8c48d 100644 --- a/llvm/test/CodeGen/X86/vec_cast2.ll +++ b/llvm/test/CodeGen/X86/vec_cast2.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+avx | FileCheck %s +; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+avx -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-WIDE define <8 x float> @cvt_v8i8_v8f32(<8 x i8> %src) { ; CHECK-LABEL: cvt_v8i8_v8f32: @@ -10,6 +11,15 @@ define <8 x float> @cvt_v8i8_v8f32(<8 x i8> %src) { ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v8i8_v8f32: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vpmovsxbd %xmm0, %xmm1 +; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-WIDE-NEXT: vpmovsxbd %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-WIDE-NEXT: retl %res = sitofp <8 x i8> %src to <8 x float> ret <8 x float> %res } @@ -23,6 +33,15 @@ define <8 x float> @cvt_v8i16_v8f32(<8 x i16> %src) { ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v8i16_v8f32: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vpmovsxwd %xmm0, %xmm1 +; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-WIDE-NEXT: vpmovsxwd %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-WIDE-NEXT: retl %res = sitofp <8 x i16> %src to <8 x float> ret <8 x float> %res } @@ -33,6 +52,12 @@ define <4 x float> @cvt_v4i8_v4f32(<4 x i8> %src) { ; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0 ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v4i8_v4f32: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vpmovsxbd %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0 +; CHECK-WIDE-NEXT: retl %res = sitofp <4 x i8> %src to <4 x float> ret <4 x float> %res } @@ -43,6 +68,12 @@ define <4 x float> @cvt_v4i16_v4f32(<4 x i16> %src) { ; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0 ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v4i16_v4f32: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vpmovsxwd %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0 +; CHECK-WIDE-NEXT: retl %res = sitofp <4 x i16> %src to <4 x float> ret <4 x float> %res } @@ -56,6 +87,15 @@ define <8 x float> @cvt_v8u8_v8f32(<8 x i8> %src) { ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v8u8_v8f32: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-WIDE-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-WIDE-NEXT: retl %res = uitofp <8 x i8> %src to <8 x float> ret <8 x float> %res } @@ -69,6 +109,15 @@ define <8 x float> @cvt_v8u16_v8f32(<8 x i16> %src) { ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v8u16_v8f32: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-WIDE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; CHECK-WIDE-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-WIDE-NEXT: retl %res = uitofp <8 x i16> %src to <8 x float> ret <8 x float> %res } @@ -79,6 +128,12 @@ define <4 x float> @cvt_v4u8_v4f32(<4 x i8> %src) { ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v4u8_v4f32: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0 +; CHECK-WIDE-NEXT: retl %res = uitofp <4 x i8> %src to <4 x float> ret <4 x float> %res } @@ -89,6 +144,12 @@ define <4 x float> @cvt_v4u16_v4f32(<4 x i16> %src) { ; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v4u16_v4f32: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0 +; CHECK-WIDE-NEXT: retl %res = uitofp <4 x i16> %src to <4 x float> ret <4 x float> %res } @@ -102,6 +163,15 @@ define <8 x i8> @cvt_v8f32_v8i8(<8 x float> %src) { ; CHECK-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v8f32_v8i8: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0 +; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-WIDE-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vzeroupper +; CHECK-WIDE-NEXT: retl %res = fptosi <8 x float> %src to <8 x i8> ret <8 x i8> %res } @@ -114,6 +184,14 @@ define <8 x i16> @cvt_v8f32_v8i16(<8 x float> %src) { ; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v8f32_v8i16: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0 +; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-WIDE-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vzeroupper +; CHECK-WIDE-NEXT: retl %res = fptosi <8 x float> %src to <8 x i16> ret <8 x i16> %res } @@ -124,6 +202,12 @@ define <4 x i8> @cvt_v4f32_v4i8(<4 x float> %src) { ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v4f32_v4i8: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; CHECK-WIDE-NEXT: retl %res = fptosi <4 x float> %src to <4 x i8> ret <4 x i8> %res } @@ -134,6 +218,12 @@ define <4 x i16> @cvt_v4f32_v4i16(<4 x float> %src) { ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 ; CHECK-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v4f32_v4i16: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: retl %res = fptosi <4 x float> %src to <4 x i16> ret <4 x i16> %res } @@ -147,6 +237,15 @@ define <8 x i8> @cvt_v8f32_v8u8(<8 x float> %src) { ; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v8f32_v8u8: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0 +; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-WIDE-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vzeroupper +; CHECK-WIDE-NEXT: retl %res = fptoui <8 x float> %src to <8 x i8> ret <8 x i8> %res } @@ -159,6 +258,14 @@ define <8 x i16> @cvt_v8f32_v8u16(<8 x float> %src) { ; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v8f32_v8u16: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0 +; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-WIDE-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vzeroupper +; CHECK-WIDE-NEXT: retl %res = fptoui <8 x float> %src to <8 x i16> ret <8 x i16> %res } @@ -169,6 +276,12 @@ define <4 x i8> @cvt_v4f32_v4u8(<4 x float> %src) { ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v4f32_v4u8: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; CHECK-WIDE-NEXT: retl %res = fptoui <4 x float> %src to <4 x i8> ret <4 x i8> %res } @@ -179,6 +292,12 @@ define <4 x i16> @cvt_v4f32_v4u16(<4 x float> %src) { ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 ; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v4f32_v4u16: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: retl %res = fptoui <4 x float> %src to <4 x i16> ret <4 x i16> %res } diff --git a/llvm/test/CodeGen/X86/vec_cast3.ll b/llvm/test/CodeGen/X86/vec_cast3.ll index 82b8c00c0a2..4148f7eb0f4 100644 --- a/llvm/test/CodeGen/X86/vec_cast3.ll +++ b/llvm/test/CodeGen/X86/vec_cast3.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+avx | FileCheck %s +; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+avx -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-WIDE define <2 x float> @cvt_v2i8_v2f32(<2 x i8> %src) { ; CHECK-LABEL: cvt_v2i8_v2f32: @@ -7,6 +8,12 @@ define <2 x float> @cvt_v2i8_v2f32(<2 x i8> %src) { ; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0 ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v2i8_v2f32: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vpmovsxbd %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0 +; CHECK-WIDE-NEXT: retl %res = sitofp <2 x i8> %src to <2 x float> ret <2 x float> %res } @@ -17,6 +24,12 @@ define <2 x float> @cvt_v2i16_v2f32(<2 x i16> %src) { ; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0 ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v2i16_v2f32: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vpmovsxwd %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0 +; CHECK-WIDE-NEXT: retl %res = sitofp <2 x i16> %src to <2 x float> ret <2 x float> %res } @@ -26,6 +39,11 @@ define <2 x float> @cvt_v2i32_v2f32(<2 x i32> %src) { ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v2i32_v2f32: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0 +; CHECK-WIDE-NEXT: retl %res = sitofp <2 x i32> %src to <2 x float> ret <2 x float> %res } @@ -36,6 +54,12 @@ define <2 x float> @cvt_v2u8_v2f32(<2 x i8> %src) { ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v2u8_v2f32: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0 +; CHECK-WIDE-NEXT: retl %res = uitofp <2 x i8> %src to <2 x float> ret <2 x float> %res } @@ -46,6 +70,12 @@ define <2 x float> @cvt_v2u16_v2f32(<2 x i16> %src) { ; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v2u16_v2f32: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0 +; CHECK-WIDE-NEXT: retl %res = uitofp <2 x i16> %src to <2 x float> ret <2 x float> %res } @@ -59,6 +89,15 @@ define <2 x float> @cvt_v2u32_v2f32(<2 x i32> %src) { ; CHECK-NEXT: vsubpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0 ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v2u32_v2f32: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; CHECK-WIDE-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; CHECK-WIDE-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vcvtpd2ps %xmm0, %xmm0 +; CHECK-WIDE-NEXT: retl %res = uitofp <2 x i32> %src to <2 x float> ret <2 x float> %res } @@ -69,6 +108,12 @@ define <2 x i8> @cvt_v2f32_v2i8(<2 x float> %src) { ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v2f32_v2i8: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; CHECK-WIDE-NEXT: retl %res = fptosi <2 x float> %src to <2 x i8> ret <2 x i8> %res } @@ -79,6 +124,12 @@ define <2 x i16> @cvt_v2f32_v2i16(<2 x float> %src) { ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 ; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v2f32_v2i16: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; CHECK-WIDE-NEXT: retl %res = fptosi <2 x float> %src to <2 x i16> ret <2 x i16> %res } @@ -88,6 +139,11 @@ define <2 x i32> @cvt_v2f32_v2i32(<2 x float> %src) { ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v2f32_v2i32: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: retl %res = fptosi <2 x float> %src to <2 x i32> ret <2 x i32> %res } @@ -98,6 +154,12 @@ define <2 x i8> @cvt_v2f32_v2u8(<2 x float> %src) { ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v2f32_v2u8: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; CHECK-WIDE-NEXT: retl %res = fptoui <2 x float> %src to <2 x i8> ret <2 x i8> %res } @@ -108,6 +170,12 @@ define <2 x i16> @cvt_v2f32_v2u16(<2 x float> %src) { ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 ; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v2f32_v2u16: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; CHECK-WIDE-NEXT: retl %res = fptoui <2 x float> %src to <2 x i16> ret <2 x i16> %res } @@ -123,6 +191,17 @@ define <2 x i32> @cvt_v2f32_v2u32(<2 x float> %src) { ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 ; CHECK-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: cvt_v2f32_v2u32: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; CHECK-WIDE-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 +; CHECK-WIDE-NEXT: vsubps %xmm1, %xmm0, %xmm1 +; CHECK-WIDE-NEXT: vcvttps2dq %xmm1, %xmm1 +; CHECK-WIDE-NEXT: vxorps LCPI11_1, %xmm1, %xmm1 +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; CHECK-WIDE-NEXT: retl %res = fptoui <2 x float> %src to <2 x i32> ret <2 x i32> %res } @@ -135,6 +214,14 @@ define <32 x i8> @PR40146(<4 x i64> %x) { ; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: PR40146: +; CHECK-WIDE: ## %bb.0: +; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; CHECK-WIDE-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-WIDE-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-WIDE-NEXT: retl %perm = shufflevector <4 x i64> %x, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef> %t1 = bitcast <4 x i64> %perm to <32 x i8> %t2 = shufflevector <32 x i8> %t1, <32 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <32 x i32> <i32 0, i32 32, i32 1, i32 32, i32 2, i32 32, i32 3, i32 32, i32 4, i32 32, i32 5, i32 32, i32 6, i32 32, i32 7, i32 32, i32 16, i32 48, i32 17, i32 48, i32 18, i32 48, i32 19, i32 48, i32 20, i32 48, i32 21, i32 48, i32 22, i32 48, i32 23, i32 48> diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-sub128.ll b/llvm/test/CodeGen/X86/vec_clz.ll index c1e7e42ac7e..6e64641f782 100644 --- a/llvm/test/CodeGen/X86/vector-lzcnt-sub128.ll +++ b/llvm/test/CodeGen/X86/vec_clz.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -x86-experimental-vector-widening-legalization | FileCheck %s declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1 immarg) diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int-widen.ll b/llvm/test/CodeGen/X86/vec_fp_to_int-widen.ll new file mode 100644 index 00000000000..9541d3834a4 --- /dev/null +++ b/llvm/test/CodeGen/X86/vec_fp_to_int-widen.ll @@ -0,0 +1,2794 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=VEX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=VEX --check-prefix=AVX2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512DQ +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VLDQ +; +; 32-bit tests to make sure we're not doing anything stupid. +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 + +; +; Double to Signed Integer +; + +define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) { +; SSE-LABEL: fptosi_2f64_to_2i64: +; SSE: # %bb.0: +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; VEX-LABEL: fptosi_2f64_to_2i64: +; VEX: # %bb.0: +; VEX-NEXT: vcvttsd2si %xmm0, %rax +; VEX-NEXT: vmovq %rax, %xmm1 +; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; VEX-NEXT: vcvttsd2si %xmm0, %rax +; VEX-NEXT: vmovq %rax, %xmm0 +; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: fptosi_2f64_to_2i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vcvttsd2si %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vcvttsd2si %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptosi_2f64_to_2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm1 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptosi_2f64_to_2i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptosi_2f64_to_2i64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttpd2qq %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = fptosi <2 x double> %a to <2 x i64> + ret <2 x i64> %cvt +} + +define <4 x i32> @fptosi_2f64_to_4i32(<2 x double> %a) { +; SSE-LABEL: fptosi_2f64_to_4i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_2f64_to_4i32: +; AVX: # %bb.0: +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = fptosi <2 x double> %a to <2 x i32> + %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x i32> %ext +} + +define <2 x i32> @fptosi_2f64_to_2i32(<2 x double> %a) { +; SSE-LABEL: fptosi_2f64_to_2i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_2f64_to_2i32: +; AVX: # %bb.0: +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = fptosi <2 x double> %a to <2 x i32> + ret <2 x i32> %cvt +} + +define <4 x i32> @fptosi_4f64_to_2i32(<2 x double> %a) { +; SSE-LABEL: fptosi_4f64_to_2i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_4f64_to_2i32: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq + %ext = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + %cvt = fptosi <4 x double> %ext to <4 x i32> + ret <4 x i32> %cvt +} + +define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) { +; SSE-LABEL: fptosi_4f64_to_4i64: +; SSE: # %bb.0: +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm2 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: cvttsd2si %xmm1, %rax +; SSE-NEXT: movq %rax, %xmm3 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: cvttsd2si %xmm1, %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptosi_4f64_to_4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vcvttsd2si %xmm1, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-NEXT: vcvttsd2si %xmm1, %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vcvttsd2si %xmm0, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vcvttsd2si %xmm0, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptosi_4f64_to_4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vcvttsd2si %xmm1, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX2-NEXT: vcvttsd2si %xmm1, %rax +; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vcvttsd2si %xmm0, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: vcvttsd2si %xmm0, %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptosi_4f64_to_4i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vcvttsd2si %xmm1, %rax +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512F-NEXT: vcvttsd2si %xmm1, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vcvttsd2si %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vcvttsd2si %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptosi_4f64_to_4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vcvttsd2si %xmm1, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm2 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512VL-NEXT: vcvttsd2si %xmm1, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm2 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptosi_4f64_to_4i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptosi_4f64_to_4i64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttpd2qq %ymm0, %ymm0 +; AVX512VLDQ-NEXT: retq + %cvt = fptosi <4 x double> %a to <4 x i64> + ret <4 x i64> %cvt +} + +define <4 x i32> @fptosi_4f64_to_4i32(<4 x double> %a) { +; SSE-LABEL: fptosi_4f64_to_4i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttpd2dq %xmm1, %xmm1 +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_4f64_to_4i32: +; AVX: # %bb.0: +; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq + %cvt = fptosi <4 x double> %a to <4 x i32> + ret <4 x i32> %cvt +} + +; +; Double to Unsigned Integer +; + +define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) { +; SSE-LABEL: fptoui_2f64_to_2i64: +; SSE: # %bb.0: +; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: subsd %xmm2, %xmm1 +; SSE-NEXT: cvttsd2si %xmm1, %rax +; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; SSE-NEXT: xorq %rcx, %rax +; SSE-NEXT: cvttsd2si %xmm0, %rdx +; SSE-NEXT: ucomisd %xmm2, %xmm0 +; SSE-NEXT: cmovaeq %rax, %rdx +; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movapd %xmm0, %xmm3 +; SSE-NEXT: subsd %xmm2, %xmm3 +; SSE-NEXT: cvttsd2si %xmm3, %rax +; SSE-NEXT: xorq %rcx, %rax +; SSE-NEXT: cvttsd2si %xmm0, %rcx +; SSE-NEXT: ucomisd %xmm2, %xmm0 +; SSE-NEXT: cmovaeq %rax, %rcx +; SSE-NEXT: movq %rcx, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; VEX-LABEL: fptoui_2f64_to_2i64: +; VEX: # %bb.0: +; VEX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm2 +; VEX-NEXT: vcvttsd2si %xmm2, %rax +; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vcvttsd2si %xmm0, %rdx +; VEX-NEXT: vucomisd %xmm1, %xmm0 +; VEX-NEXT: cmovaeq %rax, %rdx +; VEX-NEXT: vmovq %rdx, %xmm2 +; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm3 +; VEX-NEXT: vcvttsd2si %xmm3, %rax +; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vcvttsd2si %xmm0, %rcx +; VEX-NEXT: vucomisd %xmm1, %xmm0 +; VEX-NEXT: cmovaeq %rax, %rcx +; VEX-NEXT: vmovq %rcx, %xmm0 +; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: fptoui_2f64_to_2i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_2f64_to_2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm1 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_2f64_to_2i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_2f64_to_2i64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttpd2uqq %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = fptoui <2 x double> %a to <2 x i64> + ret <2 x i64> %cvt +} + +define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) { +; SSE-LABEL: fptoui_2f64_to_4i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %rcx +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movd %ecx, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; SSE-NEXT: retq +; +; AVX1-LABEL: fptoui_2f64_to_4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 +; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm3 +; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX1-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_2f64_to_4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpackssdw %xmm0, %xmm2, %xmm2 +; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptoui_2f64_to_4i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0 +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_2f64_to_4i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttpd2udq %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_2f64_to_4i32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0 +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_2f64_to_4i32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttpd2udq %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = fptoui <2 x double> %a to <2 x i32> + %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x i32> %ext +} + +define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) { +; SSE-LABEL: fptoui_2f64_to_2i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptoui_2f64_to_2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm3 +; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX1-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_2f64_to_2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptoui_2f64_to_2i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_2f64_to_2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttpd2udq %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_2f64_to_2i32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_2f64_to_2i32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttpd2udq %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = fptoui <2 x double> %a to <2 x i32> + %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + ret <4 x i32> %ext +} + +define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) { +; SSE-LABEL: fptoui_4f64_to_2i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero +; SSE-NEXT: retq +; +; AVX1-LABEL: fptoui_4f64_to_2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd %xmm0, %xmm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm1 +; AVX1-NEXT: vcvttpd2dq %ymm1, %xmm1 +; AVX1-NEXT: vxorpd {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_4f64_to_2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovapd %xmm0, %xmm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptoui_4f64_to_2i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps %xmm0, %xmm0 +; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_4f64_to_2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovaps %xmm0, %xmm0 +; AVX512VL-NEXT: vcvttpd2udq %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_4f64_to_2i32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovaps %xmm0, %xmm0 +; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_4f64_to_2i32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vmovaps %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vcvttpd2udq %ymm0, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq + %ext = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %cvt = fptoui <4 x double> %ext to <4 x i32> + ret <4 x i32> %cvt +} + +define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { +; SSE-LABEL: fptoui_4f64_to_4i64: +; SSE: # %bb.0: +; SSE-NEXT: movapd %xmm0, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; SSE-NEXT: subsd %xmm3, %xmm0 +; SSE-NEXT: cvttsd2si %xmm0, %rcx +; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttsd2si %xmm2, %rdx +; SSE-NEXT: ucomisd %xmm3, %xmm2 +; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: movq %rdx, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] +; SSE-NEXT: movapd %xmm2, %xmm4 +; SSE-NEXT: subsd %xmm3, %xmm4 +; SSE-NEXT: cvttsd2si %xmm4, %rcx +; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttsd2si %xmm2, %rdx +; SSE-NEXT: ucomisd %xmm3, %xmm2 +; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: movq %rdx, %xmm2 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movapd %xmm1, %xmm2 +; SSE-NEXT: subsd %xmm3, %xmm2 +; SSE-NEXT: cvttsd2si %xmm2, %rcx +; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttsd2si %xmm1, %rdx +; SSE-NEXT: ucomisd %xmm3, %xmm1 +; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: movq %rdx, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movapd %xmm1, %xmm4 +; SSE-NEXT: subsd %xmm3, %xmm4 +; SSE-NEXT: cvttsd2si %xmm4, %rcx +; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttsd2si %xmm1, %rax +; SSE-NEXT: ucomisd %xmm3, %xmm1 +; SSE-NEXT: cmovaeq %rcx, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptoui_4f64_to_4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vsubsd %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vcvttsd2si %xmm3, %rax +; AVX1-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttsd2si %xmm2, %rdx +; AVX1-NEXT: vucomisd %xmm1, %xmm2 +; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX1-NEXT: vsubsd %xmm1, %xmm2, %xmm4 +; AVX1-NEXT: vcvttsd2si %xmm4, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttsd2si %xmm2, %rdx +; AVX1-NEXT: vucomisd %xmm1, %xmm2 +; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vsubsd %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vcvttsd2si %xmm3, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttsd2si %xmm0, %rdx +; AVX1-NEXT: vucomisd %xmm1, %xmm0 +; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vsubsd %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vcvttsd2si %xmm4, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttsd2si %xmm0, %rcx +; AVX1-NEXT: vucomisd %xmm1, %xmm0 +; AVX1-NEXT: cmovaeq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_4f64_to_4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vsubsd %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vcvttsd2si %xmm3, %rax +; AVX2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttsd2si %xmm2, %rdx +; AVX2-NEXT: vucomisd %xmm1, %xmm2 +; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm3 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX2-NEXT: vsubsd %xmm1, %xmm2, %xmm4 +; AVX2-NEXT: vcvttsd2si %xmm4, %rax +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttsd2si %xmm2, %rdx +; AVX2-NEXT: vucomisd %xmm1, %xmm2 +; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm3 +; AVX2-NEXT: vcvttsd2si %xmm3, %rax +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttsd2si %xmm0, %rdx +; AVX2-NEXT: vucomisd %xmm1, %xmm0 +; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm3 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm4 +; AVX2-NEXT: vcvttsd2si %xmm4, %rax +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttsd2si %xmm0, %rcx +; AVX2-NEXT: vucomisd %xmm1, %xmm0 +; AVX2-NEXT: cmovaeq %rax, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptoui_4f64_to_4i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vcvttsd2usi %xmm1, %rax +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512F-NEXT: vcvttsd2usi %xmm1, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_4f64_to_4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vcvttsd2usi %xmm1, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm2 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512VL-NEXT: vcvttsd2usi %xmm1, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm2 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_4f64_to_4i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_4f64_to_4i64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttpd2uqq %ymm0, %ymm0 +; AVX512VLDQ-NEXT: retq + %cvt = fptoui <4 x double> %a to <4 x i64> + ret <4 x i64> %cvt +} + +define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) { +; SSE-LABEL: fptoui_4f64_to_4i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttsd2si %xmm1, %rax +; SSE-NEXT: movd %eax, %xmm2 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: cvttsd2si %xmm1, %rax +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptoui_4f64_to_4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm1 +; AVX1-NEXT: vcvttpd2dq %ymm1, %xmm1 +; AVX1-NEXT: vxorpd {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_4f64_to_4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptoui_4f64_to_4i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_4f64_to_4i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttpd2udq %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_4f64_to_4i32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_4f64_to_4i32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttpd2udq %ymm0, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq + %cvt = fptoui <4 x double> %a to <4 x i32> + ret <4 x i32> %cvt +} + +; +; Float to Signed Integer +; + +define <2 x i32> @fptosi_2f32_to_2i32(<2 x float> %a) { +; SSE-LABEL: fptosi_2f32_to_2i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_2f32_to_2i32: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = fptosi <2 x float> %a to <2 x i32> + ret <2 x i32> %cvt +} + +define <4 x i32> @fptosi_4f32_to_4i32(<4 x float> %a) { +; SSE-LABEL: fptosi_4f32_to_4i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_4f32_to_4i32: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = fptosi <4 x float> %a to <4 x i32> + ret <4 x i32> %cvt +} + +define <2 x i64> @fptosi_2f32_to_2i64(<4 x float> %a) { +; SSE-LABEL: fptosi_2f32_to_2i64: +; SSE: # %bb.0: +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; VEX-LABEL: fptosi_2f32_to_2i64: +; VEX: # %bb.0: +; VEX-NEXT: vcvttss2si %xmm0, %rax +; VEX-NEXT: vmovq %rax, %xmm1 +; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; VEX-NEXT: vcvttss2si %xmm0, %rax +; VEX-NEXT: vmovq %rax, %xmm0 +; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: fptosi_2f32_to_2i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vcvttss2si %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512F-NEXT: vcvttss2si %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptosi_2f32_to_2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttss2si %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm1 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2si %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptosi_2f32_to_2i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptosi_2f32_to_2i64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %shuf = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1> + %cvt = fptosi <2 x float> %shuf to <2 x i64> + ret <2 x i64> %cvt +} + +define <2 x i64> @fptosi_4f32_to_2i64(<4 x float> %a) { +; SSE-LABEL: fptosi_4f32_to_2i64: +; SSE: # %bb.0: +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; VEX-LABEL: fptosi_4f32_to_2i64: +; VEX: # %bb.0: +; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; VEX-NEXT: vcvttss2si %xmm1, %rax +; VEX-NEXT: vcvttss2si %xmm0, %rcx +; VEX-NEXT: vmovq %rcx, %xmm0 +; VEX-NEXT: vmovq %rax, %xmm1 +; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: fptosi_4f32_to_2i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512F-NEXT: vcvttss2si %xmm1, %rax +; AVX512F-NEXT: vcvttss2si %xmm0, %rcx +; AVX512F-NEXT: vmovq %rcx, %xmm0 +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptosi_4f32_to_2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2si %xmm1, %rax +; AVX512VL-NEXT: vcvttss2si %xmm0, %rcx +; AVX512VL-NEXT: vmovq %rcx, %xmm0 +; AVX512VL-NEXT: vmovq %rax, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptosi_4f32_to_2i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptosi_4f32_to_2i64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0 +; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq + %cvt = fptosi <4 x float> %a to <4 x i64> + %shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> <i32 0, i32 1> + ret <2 x i64> %shuf +} + +define <8 x i32> @fptosi_8f32_to_8i32(<8 x float> %a) { +; SSE-LABEL: fptosi_8f32_to_8i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_8f32_to_8i32: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX-NEXT: retq + %cvt = fptosi <8 x float> %a to <8 x i32> + ret <8 x i32> %cvt +} + +define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) { +; SSE-LABEL: fptosi_4f32_to_4i64: +; SSE: # %bb.0: +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm2 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] +; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] +; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: movq %rax, %xmm3 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptosi_4f32_to_4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX1-NEXT: vcvttss2si %xmm1, %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX1-NEXT: vcvttss2si %xmm2, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vcvttss2si %xmm0, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vcvttss2si %xmm0, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptosi_4f32_to_4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX2-NEXT: vcvttss2si %xmm1, %rax +; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX2-NEXT: vcvttss2si %xmm2, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vcvttss2si %xmm0, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vcvttss2si %xmm0, %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptosi_4f32_to_4i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX512F-NEXT: vcvttss2si %xmm1, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512F-NEXT: vcvttss2si %xmm2, %rax +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vcvttss2si %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512F-NEXT: vcvttss2si %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptosi_4f32_to_4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX512VL-NEXT: vcvttss2si %xmm1, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm1 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512VL-NEXT: vcvttss2si %xmm2, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm2 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-NEXT: vcvttss2si %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm2 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2si %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptosi_4f32_to_4i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptosi_4f32_to_4i64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0 +; AVX512VLDQ-NEXT: retq + %shuf = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %cvt = fptosi <4 x float> %shuf to <4 x i64> + ret <4 x i64> %cvt +} + +define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) { +; SSE-LABEL: fptosi_8f32_to_4i64: +; SSE: # %bb.0: +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm2 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] +; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] +; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: movq %rax, %xmm3 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptosi_8f32_to_4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX1-NEXT: vcvttss2si %xmm1, %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX1-NEXT: vcvttss2si %xmm2, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vcvttss2si %xmm0, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vcvttss2si %xmm0, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptosi_8f32_to_4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX2-NEXT: vcvttss2si %xmm1, %rax +; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX2-NEXT: vcvttss2si %xmm2, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vcvttss2si %xmm0, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vcvttss2si %xmm0, %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptosi_8f32_to_4i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512F-NEXT: vcvttss2si %xmm1, %rax +; AVX512F-NEXT: vcvttss2si %xmm0, %rcx +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512F-NEXT: vcvttss2si %xmm1, %rdx +; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512F-NEXT: vcvttss2si %xmm0, %rsi +; AVX512F-NEXT: vmovq %rsi, %xmm0 +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rcx, %xmm1 +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptosi_8f32_to_4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2si %xmm1, %rax +; AVX512VL-NEXT: vcvttss2si %xmm0, %rcx +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512VL-NEXT: vcvttss2si %xmm1, %rdx +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512VL-NEXT: vcvttss2si %xmm0, %rsi +; AVX512VL-NEXT: vmovq %rsi, %xmm0 +; AVX512VL-NEXT: vmovq %rdx, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: vmovq %rcx, %xmm1 +; AVX512VL-NEXT: vmovq %rax, %xmm2 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptosi_8f32_to_4i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptosi_8f32_to_4i64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2qq %ymm0, %zmm0 +; AVX512VLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512VLDQ-NEXT: retq + %cvt = fptosi <8 x float> %a to <8 x i64> + %shuf = shufflevector <8 x i64> %cvt, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x i64> %shuf +} + +; +; Float to Unsigned Integer +; + +define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) { +; SSE-LABEL: fptoui_2f32_to_2i32: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: cmpltps %xmm2, %xmm1 +; SSE-NEXT: cvttps2dq %xmm0, %xmm3 +; SSE-NEXT: subps %xmm2, %xmm0 +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: xorps {{.*}}(%rip), %xmm0 +; SSE-NEXT: andps %xmm1, %xmm3 +; SSE-NEXT: andnps %xmm0, %xmm1 +; SSE-NEXT: orps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptoui_2f32_to_2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX1-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vsubps %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 +; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_2f32_to_2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX2-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vsubps %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vcvttps2dq %xmm1, %xmm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vxorps %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptoui_2f32_to_2i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_2f32_to_2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttps2udq %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_2f32_to_2i32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_2f32_to_2i32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2udq %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = fptoui <2 x float> %a to <2 x i32> + ret <2 x i32> %cvt +} + +define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) { +; SSE-LABEL: fptoui_4f32_to_4i32: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: cmpltps %xmm2, %xmm1 +; SSE-NEXT: cvttps2dq %xmm0, %xmm3 +; SSE-NEXT: subps %xmm2, %xmm0 +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: xorps {{.*}}(%rip), %xmm0 +; SSE-NEXT: andps %xmm1, %xmm3 +; SSE-NEXT: andnps %xmm0, %xmm1 +; SSE-NEXT: orps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptoui_4f32_to_4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX1-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vsubps %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 +; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_4f32_to_4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX2-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vsubps %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vcvttps2dq %xmm1, %xmm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vxorps %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptoui_4f32_to_4i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_4f32_to_4i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttps2udq %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_4f32_to_4i32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_4f32_to_4i32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2udq %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = fptoui <4 x float> %a to <4 x i32> + ret <4 x i32> %cvt +} + +define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) { +; SSE-LABEL: fptoui_2f32_to_2i64: +; SSE: # %bb.0: +; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: subss %xmm2, %xmm1 +; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; SSE-NEXT: xorq %rcx, %rax +; SSE-NEXT: cvttss2si %xmm0, %rdx +; SSE-NEXT: ucomiss %xmm2, %xmm0 +; SSE-NEXT: cmovaeq %rax, %rdx +; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: subss %xmm2, %xmm3 +; SSE-NEXT: cvttss2si %xmm3, %rax +; SSE-NEXT: xorq %rcx, %rax +; SSE-NEXT: cvttss2si %xmm0, %rcx +; SSE-NEXT: ucomiss %xmm2, %xmm0 +; SSE-NEXT: cmovaeq %rax, %rcx +; SSE-NEXT: movq %rcx, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; VEX-LABEL: fptoui_2f32_to_2i64: +; VEX: # %bb.0: +; VEX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm2 +; VEX-NEXT: vcvttss2si %xmm2, %rax +; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vcvttss2si %xmm0, %rdx +; VEX-NEXT: vucomiss %xmm1, %xmm0 +; VEX-NEXT: cmovaeq %rax, %rdx +; VEX-NEXT: vmovq %rdx, %xmm2 +; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm3 +; VEX-NEXT: vcvttss2si %xmm3, %rax +; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vcvttss2si %xmm0, %rcx +; VEX-NEXT: vucomiss %xmm1, %xmm0 +; VEX-NEXT: cmovaeq %rax, %rcx +; VEX-NEXT: vmovq %rcx, %xmm0 +; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: fptoui_2f32_to_2i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vcvttss2usi %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512F-NEXT: vcvttss2usi %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_2f32_to_2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm1 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_2f32_to_2i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_2f32_to_2i64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %shuf = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1> + %cvt = fptoui <2 x float> %shuf to <2 x i64> + ret <2 x i64> %cvt +} + +define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) { +; SSE-LABEL: fptoui_4f32_to_2i64: +; SSE: # %bb.0: +; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: subss %xmm2, %xmm1 +; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; SSE-NEXT: xorq %rcx, %rax +; SSE-NEXT: cvttss2si %xmm0, %rdx +; SSE-NEXT: ucomiss %xmm2, %xmm0 +; SSE-NEXT: cmovaeq %rax, %rdx +; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: subss %xmm2, %xmm3 +; SSE-NEXT: cvttss2si %xmm3, %rax +; SSE-NEXT: xorq %rcx, %rax +; SSE-NEXT: cvttss2si %xmm0, %rcx +; SSE-NEXT: ucomiss %xmm2, %xmm0 +; SSE-NEXT: cmovaeq %rax, %rcx +; SSE-NEXT: movq %rcx, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; VEX-LABEL: fptoui_4f32_to_2i64: +; VEX: # %bb.0: +; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; VEX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; VEX-NEXT: vsubss %xmm2, %xmm1, %xmm3 +; VEX-NEXT: vcvttss2si %xmm3, %rax +; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vcvttss2si %xmm1, %rdx +; VEX-NEXT: vucomiss %xmm2, %xmm1 +; VEX-NEXT: cmovaeq %rax, %rdx +; VEX-NEXT: vsubss %xmm2, %xmm0, %xmm1 +; VEX-NEXT: vcvttss2si %xmm1, %rax +; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vcvttss2si %xmm0, %rcx +; VEX-NEXT: vucomiss %xmm2, %xmm0 +; VEX-NEXT: cmovaeq %rax, %rcx +; VEX-NEXT: vmovq %rcx, %xmm0 +; VEX-NEXT: vmovq %rdx, %xmm1 +; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: fptoui_4f32_to_2i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512F-NEXT: vcvttss2usi %xmm1, %rax +; AVX512F-NEXT: vcvttss2usi %xmm0, %rcx +; AVX512F-NEXT: vmovq %rcx, %xmm0 +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_4f32_to_2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2usi %xmm1, %rax +; AVX512VL-NEXT: vcvttss2usi %xmm0, %rcx +; AVX512VL-NEXT: vmovq %rcx, %xmm0 +; AVX512VL-NEXT: vmovq %rax, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_4f32_to_2i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_4f32_to_2i64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %ymm0 +; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq + %cvt = fptoui <4 x float> %a to <4 x i64> + %shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> <i32 0, i32 1> + ret <2 x i64> %shuf +} + +define <8 x i32> @fptoui_8f32_to_8i32(<8 x float> %a) { +; SSE-LABEL: fptoui_8f32_to_8i32: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm4 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: cmpltps %xmm4, %xmm2 +; SSE-NEXT: cvttps2dq %xmm0, %xmm3 +; SSE-NEXT: subps %xmm4, %xmm0 +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: movaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: xorps %xmm5, %xmm0 +; SSE-NEXT: andps %xmm2, %xmm3 +; SSE-NEXT: andnps %xmm0, %xmm2 +; SSE-NEXT: orps %xmm3, %xmm2 +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: cmpltps %xmm4, %xmm3 +; SSE-NEXT: cvttps2dq %xmm1, %xmm0 +; SSE-NEXT: subps %xmm4, %xmm1 +; SSE-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE-NEXT: xorps %xmm5, %xmm1 +; SSE-NEXT: andps %xmm3, %xmm0 +; SSE-NEXT: andnps %xmm1, %xmm3 +; SSE-NEXT: orps %xmm0, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptoui_8f32_to_8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm2 +; AVX1-NEXT: vsubps %ymm1, %ymm0, %ymm1 +; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1 +; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_8f32_to_8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX2-NEXT: vcmpltps %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vsubps %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vxorps %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptoui_8f32_to_8i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_8f32_to_8i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttps2udq %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_8f32_to_8i32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_8f32_to_8i32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2udq %ymm0, %ymm0 +; AVX512VLDQ-NEXT: retq + %cvt = fptoui <8 x float> %a to <8 x i32> + ret <8 x i32> %cvt +} + +define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { +; SSE-LABEL: fptoui_4f32_to_4i64: +; SSE: # %bb.0: +; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: subss %xmm1, %xmm2 +; SSE-NEXT: cvttss2si %xmm2, %rcx +; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttss2si %xmm0, %rdx +; SSE-NEXT: ucomiss %xmm1, %xmm0 +; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: movq %rdx, %xmm2 +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: subss %xmm1, %xmm4 +; SSE-NEXT: cvttss2si %xmm4, %rcx +; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttss2si %xmm3, %rdx +; SSE-NEXT: ucomiss %xmm1, %xmm3 +; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: movq %rdx, %xmm3 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: subss %xmm1, %xmm4 +; SSE-NEXT: cvttss2si %xmm4, %rcx +; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttss2si %xmm3, %rdx +; SSE-NEXT: ucomiss %xmm1, %xmm3 +; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: movq %rdx, %xmm3 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: subss %xmm1, %xmm4 +; SSE-NEXT: cvttss2si %xmm4, %rcx +; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: ucomiss %xmm1, %xmm0 +; SSE-NEXT: cmovaeq %rcx, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptoui_4f32_to_4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vcvttss2si %xmm3, %rax +; AVX1-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttss2si %xmm2, %rdx +; AVX1-NEXT: vucomiss %xmm1, %xmm2 +; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX1-NEXT: vsubss %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vcvttss2si %xmm4, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttss2si %xmm3, %rdx +; AVX1-NEXT: vucomiss %xmm1, %xmm3 +; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vcvttss2si %xmm3, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttss2si %xmm0, %rdx +; AVX1-NEXT: vucomiss %xmm1, %xmm0 +; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vcvttss2si %xmm4, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttss2si %xmm0, %rcx +; AVX1-NEXT: vucomiss %xmm1, %xmm0 +; AVX1-NEXT: cmovaeq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_4f32_to_4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] +; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vcvttss2si %xmm3, %rax +; AVX2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttss2si %xmm2, %rdx +; AVX2-NEXT: vucomiss %xmm1, %xmm2 +; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX2-NEXT: vsubss %xmm1, %xmm3, %xmm4 +; AVX2-NEXT: vcvttss2si %xmm4, %rax +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttss2si %xmm3, %rdx +; AVX2-NEXT: vucomiss %xmm1, %xmm3 +; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm3 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm3 +; AVX2-NEXT: vcvttss2si %xmm3, %rax +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttss2si %xmm0, %rdx +; AVX2-NEXT: vucomiss %xmm1, %xmm0 +; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm3 +; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm4 +; AVX2-NEXT: vcvttss2si %xmm4, %rax +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttss2si %xmm0, %rcx +; AVX2-NEXT: vucomiss %xmm1, %xmm0 +; AVX2-NEXT: cmovaeq %rax, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptoui_4f32_to_4i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX512F-NEXT: vcvttss2usi %xmm1, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512F-NEXT: vcvttss2usi %xmm2, %rax +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vcvttss2usi %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512F-NEXT: vcvttss2usi %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_4f32_to_4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX512VL-NEXT: vcvttss2usi %xmm1, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm1 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512VL-NEXT: vcvttss2usi %xmm2, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm2 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm2 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_4f32_to_4i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_4f32_to_4i64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %ymm0 +; AVX512VLDQ-NEXT: retq + %shuf = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %cvt = fptoui <4 x float> %shuf to <4 x i64> + ret <4 x i64> %cvt +} + +define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { +; SSE-LABEL: fptoui_8f32_to_4i64: +; SSE: # %bb.0: +; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: subss %xmm1, %xmm2 +; SSE-NEXT: cvttss2si %xmm2, %rcx +; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttss2si %xmm0, %rdx +; SSE-NEXT: ucomiss %xmm1, %xmm0 +; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: movq %rdx, %xmm2 +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: subss %xmm1, %xmm4 +; SSE-NEXT: cvttss2si %xmm4, %rcx +; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttss2si %xmm3, %rdx +; SSE-NEXT: ucomiss %xmm1, %xmm3 +; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: movq %rdx, %xmm3 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: subss %xmm1, %xmm4 +; SSE-NEXT: cvttss2si %xmm4, %rcx +; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttss2si %xmm3, %rdx +; SSE-NEXT: ucomiss %xmm1, %xmm3 +; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: movq %rdx, %xmm3 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: subss %xmm1, %xmm4 +; SSE-NEXT: cvttss2si %xmm4, %rcx +; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: ucomiss %xmm1, %xmm0 +; SSE-NEXT: cmovaeq %rcx, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptoui_8f32_to_4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vcvttss2si %xmm3, %rax +; AVX1-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttss2si %xmm2, %rdx +; AVX1-NEXT: vucomiss %xmm1, %xmm2 +; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX1-NEXT: vsubss %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vcvttss2si %xmm4, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttss2si %xmm3, %rdx +; AVX1-NEXT: vucomiss %xmm1, %xmm3 +; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vcvttss2si %xmm3, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttss2si %xmm0, %rdx +; AVX1-NEXT: vucomiss %xmm1, %xmm0 +; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vcvttss2si %xmm4, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttss2si %xmm0, %rcx +; AVX1-NEXT: vucomiss %xmm1, %xmm0 +; AVX1-NEXT: cmovaeq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_8f32_to_4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] +; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vcvttss2si %xmm3, %rax +; AVX2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttss2si %xmm2, %rdx +; AVX2-NEXT: vucomiss %xmm1, %xmm2 +; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX2-NEXT: vsubss %xmm1, %xmm3, %xmm4 +; AVX2-NEXT: vcvttss2si %xmm4, %rax +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttss2si %xmm3, %rdx +; AVX2-NEXT: vucomiss %xmm1, %xmm3 +; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm3 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm3 +; AVX2-NEXT: vcvttss2si %xmm3, %rax +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttss2si %xmm0, %rdx +; AVX2-NEXT: vucomiss %xmm1, %xmm0 +; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm3 +; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm4 +; AVX2-NEXT: vcvttss2si %xmm4, %rax +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttss2si %xmm0, %rcx +; AVX2-NEXT: vucomiss %xmm1, %xmm0 +; AVX2-NEXT: cmovaeq %rax, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptoui_8f32_to_4i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512F-NEXT: vcvttss2usi %xmm1, %rax +; AVX512F-NEXT: vcvttss2usi %xmm0, %rcx +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512F-NEXT: vcvttss2usi %xmm1, %rdx +; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512F-NEXT: vcvttss2usi %xmm0, %rsi +; AVX512F-NEXT: vmovq %rsi, %xmm0 +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rcx, %xmm1 +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_8f32_to_4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2usi %xmm1, %rax +; AVX512VL-NEXT: vcvttss2usi %xmm0, %rcx +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512VL-NEXT: vcvttss2usi %xmm1, %rdx +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512VL-NEXT: vcvttss2usi %xmm0, %rsi +; AVX512VL-NEXT: vmovq %rsi, %xmm0 +; AVX512VL-NEXT: vmovq %rdx, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: vmovq %rcx, %xmm1 +; AVX512VL-NEXT: vmovq %rax, %xmm2 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_8f32_to_4i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_8f32_to_4i64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2uqq %ymm0, %zmm0 +; AVX512VLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512VLDQ-NEXT: retq + %cvt = fptoui <8 x float> %a to <8 x i64> + %shuf = shufflevector <8 x i64> %cvt, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x i64> %shuf +} + +; +; Constant Folding +; + +define <2 x i64> @fptosi_2f64_to_2i64_const() { +; SSE-LABEL: fptosi_2f64_to_2i64_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,18446744073709551615] +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_2f64_to_2i64_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,18446744073709551615] +; AVX-NEXT: retq + %cvt = fptosi <2 x double> <double 1.0, double -1.0> to <2 x i64> + ret <2 x i64> %cvt +} + +define <4 x i32> @fptosi_2f64_to_2i32_const() { +; SSE-LABEL: fptosi_2f64_to_2i32_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = <4294967295,1,u,u> +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_2f64_to_2i32_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <4294967295,1,u,u> +; AVX-NEXT: retq + %cvt = fptosi <2 x double> <double -1.0, double 1.0> to <2 x i32> + %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + ret <4 x i32> %ext +} + +define <4 x i64> @fptosi_4f64_to_4i64_const() { +; SSE-LABEL: fptosi_4f64_to_4i64_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,18446744073709551615] +; SSE-NEXT: movaps {{.*#+}} xmm1 = [2,18446744073709551613] +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_4f64_to_4i64_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,18446744073709551613] +; AVX-NEXT: retq + %cvt = fptosi <4 x double> <double 1.0, double -1.0, double 2.0, double -3.0> to <4 x i64> + ret <4 x i64> %cvt +} + +define <4 x i32> @fptosi_4f64_to_4i32_const() { +; SSE-LABEL: fptosi_4f64_to_4i32_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3] +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_4f64_to_4i32_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3] +; AVX-NEXT: retq + %cvt = fptosi <4 x double> <double -1.0, double 1.0, double -2.0, double 3.0> to <4 x i32> + ret <4 x i32> %cvt +} + +define <2 x i64> @fptoui_2f64_to_2i64_const() { +; SSE-LABEL: fptoui_2f64_to_2i64_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4] +; SSE-NEXT: retq +; +; AVX-LABEL: fptoui_2f64_to_2i64_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4] +; AVX-NEXT: retq + %cvt = fptoui <2 x double> <double 2.0, double 4.0> to <2 x i64> + ret <2 x i64> %cvt +} + +define <4 x i32> @fptoui_2f64_to_2i32_const(<2 x double> %a) { +; SSE-LABEL: fptoui_2f64_to_2i32_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = <2,4,u,u> +; SSE-NEXT: retq +; +; AVX-LABEL: fptoui_2f64_to_2i32_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <2,4,u,u> +; AVX-NEXT: retq + %cvt = fptoui <2 x double> <double 2.0, double 4.0> to <2 x i32> + %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + ret <4 x i32> %ext +} + +define <4 x i64> @fptoui_4f64_to_4i64_const(<4 x double> %a) { +; SSE-LABEL: fptoui_4f64_to_4i64_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4] +; SSE-NEXT: movaps {{.*#+}} xmm1 = [6,8] +; SSE-NEXT: retq +; +; AVX-LABEL: fptoui_4f64_to_4i64_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [2,4,6,8] +; AVX-NEXT: retq + %cvt = fptoui <4 x double> <double 2.0, double 4.0, double 6.0, double 8.0> to <4 x i64> + ret <4 x i64> %cvt +} + +define <4 x i32> @fptoui_4f64_to_4i32_const(<4 x double> %a) { +; SSE-LABEL: fptoui_4f64_to_4i32_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4,6,8] +; SSE-NEXT: retq +; +; AVX-LABEL: fptoui_4f64_to_4i32_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4,6,8] +; AVX-NEXT: retq + %cvt = fptoui <4 x double> <double 2.0, double 4.0, double 6.0, double 8.0> to <4 x i32> + ret <4 x i32> %cvt +} + +define <4 x i32> @fptosi_4f32_to_4i32_const() { +; SSE-LABEL: fptosi_4f32_to_4i32_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,4294967295,2,3] +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_4f32_to_4i32_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,4294967295,2,3] +; AVX-NEXT: retq + %cvt = fptosi <4 x float> <float 1.0, float -1.0, float 2.0, float 3.0> to <4 x i32> + ret <4 x i32> %cvt +} + +define <4 x i64> @fptosi_4f32_to_4i64_const() { +; SSE-LABEL: fptosi_4f32_to_4i64_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,18446744073709551615] +; SSE-NEXT: movaps {{.*#+}} xmm1 = [2,3] +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_4f32_to_4i64_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,3] +; AVX-NEXT: retq + %cvt = fptosi <4 x float> <float 1.0, float -1.0, float 2.0, float 3.0> to <4 x i64> + ret <4 x i64> %cvt +} + +define <8 x i32> @fptosi_8f32_to_8i32_const(<8 x float> %a) { +; SSE-LABEL: fptosi_8f32_to_8i32_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,4294967295,2,3] +; SSE-NEXT: movaps {{.*#+}} xmm1 = [6,4294967288,2,4294967295] +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_8f32_to_8i32_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295] +; AVX-NEXT: retq + %cvt = fptosi <8 x float> <float 1.0, float -1.0, float 2.0, float 3.0, float 6.0, float -8.0, float 2.0, float -1.0> to <8 x i32> + ret <8 x i32> %cvt +} + +define <4 x i32> @fptoui_4f32_to_4i32_const(<4 x float> %a) { +; SSE-LABEL: fptoui_4f32_to_4i32_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,2,4,6] +; SSE-NEXT: retq +; +; AVX-LABEL: fptoui_4f32_to_4i32_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,4,6] +; AVX-NEXT: retq + %cvt = fptoui <4 x float> <float 1.0, float 2.0, float 4.0, float 6.0> to <4 x i32> + ret <4 x i32> %cvt +} + +define <4 x i64> @fptoui_4f32_to_4i64_const() { +; SSE-LABEL: fptoui_4f32_to_4i64_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,2] +; SSE-NEXT: movaps {{.*#+}} xmm1 = [4,8] +; SSE-NEXT: retq +; +; AVX-LABEL: fptoui_4f32_to_4i64_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8] +; AVX-NEXT: retq + %cvt = fptoui <4 x float> <float 1.0, float 2.0, float 4.0, float 8.0> to <4 x i64> + ret <4 x i64> %cvt +} + +define <8 x i32> @fptoui_8f32_to_8i32_const(<8 x float> %a) { +; SSE-LABEL: fptoui_8f32_to_8i32_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,2,4,6] +; SSE-NEXT: movaps {{.*#+}} xmm1 = [8,6,4,1] +; SSE-NEXT: retq +; +; AVX-LABEL: fptoui_8f32_to_8i32_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1] +; AVX-NEXT: retq + %cvt = fptoui <8 x float> <float 1.0, float 2.0, float 4.0, float 6.0, float 8.0, float 6.0, float 4.0, float 1.0> to <8 x i32> + ret <8 x i32> %cvt +} + +; +; Special Cases +; + +define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind { +; SSE-LABEL: fptosi_2f16_to_4i32: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rax +; SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: callq __gnu_f2h_ieee +; SSE-NEXT: movzwl %ax, %edi +; SSE-NEXT: callq __gnu_h2f_ieee +; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: callq __gnu_f2h_ieee +; SSE-NEXT: movzwl %ax, %edi +; SSE-NEXT: callq __gnu_h2f_ieee +; SSE-NEXT: cvttss2si %xmm0, %eax +; SSE-NEXT: cvttss2si (%rsp), %ecx # 4-byte Folded Reload +; SSE-NEXT: movd %ecx, %xmm0 +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero +; SSE-NEXT: popq %rax +; SSE-NEXT: retq +; +; VEX-LABEL: fptosi_2f16_to_4i32: +; VEX: # %bb.0: +; VEX-NEXT: pushq %rax +; VEX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; VEX-NEXT: vmovaps %xmm1, %xmm0 +; VEX-NEXT: callq __gnu_f2h_ieee +; VEX-NEXT: movzwl %ax, %edi +; VEX-NEXT: callq __gnu_h2f_ieee +; VEX-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill +; VEX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; VEX-NEXT: # xmm0 = mem[0],zero,zero,zero +; VEX-NEXT: callq __gnu_f2h_ieee +; VEX-NEXT: movzwl %ax, %edi +; VEX-NEXT: callq __gnu_h2f_ieee +; VEX-NEXT: vcvttss2si %xmm0, %eax +; VEX-NEXT: vcvttss2si (%rsp), %ecx # 4-byte Folded Reload +; VEX-NEXT: vmovd %ecx, %xmm0 +; VEX-NEXT: vmovd %eax, %xmm1 +; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; VEX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; VEX-NEXT: popq %rax +; VEX-NEXT: retq +; +; AVX512-LABEL: fptosi_2f16_to_4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vcvttss2si %xmm0, %eax +; AVX512-NEXT: vcvttss2si %xmm1, %ecx +; AVX512-NEXT: vmovd %ecx, %xmm0 +; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512-NEXT: retq + %cvt = fptosi <2 x half> %a to <2 x i32> + %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x i32> %ext +} + +define <4 x i32> @fptosi_2f80_to_4i32(<2 x x86_fp80> %a) nounwind { +; SSE-LABEL: fptosi_2f80_to_4i32: +; SSE: # %bb.0: +; SSE-NEXT: fldt {{[0-9]+}}(%rsp) +; SSE-NEXT: fldt {{[0-9]+}}(%rsp) +; SSE-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) +; SSE-NEXT: fistpl -{{[0-9]+}}(%rsp) +; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) +; SSE-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) +; SSE-NEXT: fistpl -{{[0-9]+}}(%rsp) +; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_2f80_to_4i32: +; AVX: # %bb.0: +; AVX-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX-NEXT: fisttpl -{{[0-9]+}}(%rsp) +; AVX-NEXT: fisttpl -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX-NEXT: retq + %cvt = fptosi <2 x x86_fp80> %a to <2 x i32> + %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x i32> %ext +} + +define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind { +; SSE-LABEL: fptosi_2f128_to_4i32: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq %rcx, %r14 +; SSE-NEXT: movq %rdx, %rbx +; SSE-NEXT: callq __fixtfsi +; SSE-NEXT: movl %eax, %ebp +; SSE-NEXT: movq %rbx, %rdi +; SSE-NEXT: movq %r14, %rsi +; SSE-NEXT: callq __fixtfsi +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movd %ebp, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %rbp +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_2f128_to_4i32: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbp +; AVX-NEXT: pushq %r14 +; AVX-NEXT: pushq %rbx +; AVX-NEXT: movq %rcx, %r14 +; AVX-NEXT: movq %rdx, %rbx +; AVX-NEXT: callq __fixtfsi +; AVX-NEXT: movl %eax, %ebp +; AVX-NEXT: movq %rbx, %rdi +; AVX-NEXT: movq %r14, %rsi +; AVX-NEXT: callq __fixtfsi +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vmovd %ebp, %xmm1 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX-NEXT: popq %rbx +; AVX-NEXT: popq %r14 +; AVX-NEXT: popq %rbp +; AVX-NEXT: retq + %cvt = fptosi <2 x fp128> %a to <2 x i32> + %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x i32> %ext +} + +define <2 x i8> @fptosi_2f32_to_2i8(<2 x float> %a) { +; SSE-LABEL: fptosi_2f32_to_2i8: +; SSE: # %bb.0: +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_2f32_to_2i8: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: retq + %cvt = fptosi <2 x float> %a to <2 x i8> + ret <2 x i8> %cvt +} + +define <2 x i16> @fptosi_2f32_to_2i16(<2 x float> %a) { +; SSE-LABEL: fptosi_2f32_to_2i16: +; SSE: # %bb.0: +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_2f32_to_2i16: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: retq + %cvt = fptosi <2 x float> %a to <2 x i16> + ret <2 x i16> %cvt +} + +define <2 x i8> @fptoui_2f32_to_2i8(<2 x float> %a) { +; SSE-LABEL: fptoui_2f32_to_2i8: +; SSE: # %bb.0: +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: fptoui_2f32_to_2i8: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: retq + %cvt = fptoui <2 x float> %a to <2 x i8> + ret <2 x i8> %cvt +} + +define <2 x i16> @fptoui_2f32_to_2i16(<2 x float> %a) { +; SSE-LABEL: fptoui_2f32_to_2i16: +; SSE: # %bb.0: +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: retq +; +; AVX-LABEL: fptoui_2f32_to_2i16: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: retq + %cvt = fptoui <2 x float> %a to <2 x i16> + ret <2 x i16> %cvt +} + +define <2 x i8> @fptosi_2f64_to_2i8(<2 x double> %a) { +; SSE-LABEL: fptosi_2f64_to_2i8: +; SSE: # %bb.0: +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: andpd {{.*}}(%rip), %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_2f64_to_2i8: +; AVX: # %bb.0: +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: retq + %cvt = fptosi <2 x double> %a to <2 x i8> + ret <2 x i8> %cvt +} + +define <2 x i16> @fptosi_2f64_to_2i16(<2 x double> %a) { +; SSE-LABEL: fptosi_2f64_to_2i16: +; SSE: # %bb.0: +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_2f64_to_2i16: +; AVX: # %bb.0: +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: retq + %cvt = fptosi <2 x double> %a to <2 x i16> + ret <2 x i16> %cvt +} + +define <2 x i8> @fptoui_2f64_to_2i8(<2 x double> %a) { +; SSE-LABEL: fptoui_2f64_to_2i8: +; SSE: # %bb.0: +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: andpd {{.*}}(%rip), %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: fptoui_2f64_to_2i8: +; AVX: # %bb.0: +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: retq + %cvt = fptoui <2 x double> %a to <2 x i8> + ret <2 x i8> %cvt +} + +define <2 x i16> @fptoui_2f64_to_2i16(<2 x double> %a) { +; SSE-LABEL: fptoui_2f64_to_2i16: +; SSE: # %bb.0: +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: retq +; +; AVX-LABEL: fptoui_2f64_to_2i16: +; AVX: # %bb.0: +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: retq + %cvt = fptoui <2 x double> %a to <2 x i16> + ret <2 x i16> %cvt +} + +define <8 x i16> @fptosi_8f64_to_8i16(<8 x double> %a) { +; SSE-LABEL: fptosi_8f64_to_8i16: +; SSE: # %bb.0: +; SSE-NEXT: cvttpd2dq %xmm3, %xmm3 +; SSE-NEXT: cvttpd2dq %xmm2, %xmm2 +; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: cvttpd2dq %xmm1, %xmm1 +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: packssdw %xmm2, %xmm0 +; SSE-NEXT: retq +; +; VEX-LABEL: fptosi_8f64_to_8i16: +; VEX: # %bb.0: +; VEX-NEXT: vcvttpd2dq %ymm1, %xmm1 +; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0 +; VEX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; VEX-NEXT: vzeroupper +; VEX-NEXT: retq +; +; AVX512F-LABEL: fptosi_8f64_to_8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptosi_8f64_to_8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptosi_8f64_to_8i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptosi_8f64_to_8i16: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512VLDQ-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq + %cvt = fptosi <8 x double> %a to <8 x i16> + ret <8 x i16> %cvt +} + +define <8 x i16> @fptoui_8f64_to_8i16(<8 x double> %a) { +; SSE-LABEL: fptoui_8f64_to_8i16: +; SSE: # %bb.0: +; SSE-NEXT: cvttpd2dq %xmm3, %xmm3 +; SSE-NEXT: cvttpd2dq %xmm2, %xmm2 +; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: cvttpd2dq %xmm1, %xmm1 +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: retq +; +; VEX-LABEL: fptoui_8f64_to_8i16: +; VEX: # %bb.0: +; VEX-NEXT: vcvttpd2dq %ymm1, %xmm1 +; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0 +; VEX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; VEX-NEXT: vzeroupper +; VEX-NEXT: retq +; +; AVX512F-LABEL: fptoui_8f64_to_8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_8f64_to_8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_8f64_to_8i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_8f64_to_8i16: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512VLDQ-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq + %cvt = fptoui <8 x double> %a to <8 x i16> + ret <8 x i16> %cvt +} + +define <16 x i8> @fptosi_16f32_to_16i8(<16 x float> %a) { +; SSE-LABEL: fptosi_16f32_to_16i8: +; SSE: # %bb.0: +; SSE-NEXT: cvttps2dq %xmm3, %xmm3 +; SSE-NEXT: cvttps2dq %xmm2, %xmm2 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptosi_16f32_to_16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptosi_16f32_to_16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: fptosi_16f32_to_16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvttps2dq %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %cvt = fptosi <16 x float> %a to <16 x i8> + ret <16 x i8> %cvt +} + +define <16 x i8> @fptoui_16f32_to_16i8(<16 x float> %a) { +; SSE-LABEL: fptoui_16f32_to_16i8: +; SSE: # %bb.0: +; SSE-NEXT: cvttps2dq %xmm3, %xmm3 +; SSE-NEXT: cvttps2dq %xmm2, %xmm2 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptoui_16f32_to_16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_16f32_to_16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: fptoui_16f32_to_16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvttps2dq %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %cvt = fptoui <16 x float> %a to <16 x i8> + ret <16 x i8> %cvt +} + +define <2 x i64> @fptosi_2f32_to_2i64_load(<2 x float>* %x) { +; SSE-LABEL: fptosi_2f32_to_2i64_load: +; SSE: # %bb.0: +; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; VEX-LABEL: fptosi_2f32_to_2i64_load: +; VEX: # %bb.0: +; VEX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; VEX-NEXT: vcvttss2si %xmm0, %rax +; VEX-NEXT: vmovq %rax, %xmm1 +; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; VEX-NEXT: vcvttss2si %xmm0, %rax +; VEX-NEXT: vmovq %rax, %xmm0 +; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: fptosi_2f32_to_2i64_load: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: vcvttss2si %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512F-NEXT: vcvttss2si %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptosi_2f32_to_2i64_load: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512VL-NEXT: vcvttss2si %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm1 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2si %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptosi_2f32_to_2i64_load: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptosi_2f32_to_2i64_load: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2qq (%rdi), %xmm0 +; AVX512VLDQ-NEXT: retq + %a = load <2 x float>, <2 x float>* %x + %b = fptosi <2 x float> %a to <2 x i64> + ret <2 x i64> %b +} + +define <2 x i64> @fptoui_2f32_to_2i64_load(<2 x float>* %x) { +; SSE-LABEL: fptoui_2f32_to_2i64_load: +; SSE: # %bb.0: +; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: subss %xmm2, %xmm0 +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; SSE-NEXT: xorq %rcx, %rax +; SSE-NEXT: cvttss2si %xmm1, %rdx +; SSE-NEXT: ucomiss %xmm2, %xmm1 +; SSE-NEXT: cmovaeq %rax, %rdx +; SSE-NEXT: movq %rdx, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: subss %xmm2, %xmm3 +; SSE-NEXT: cvttss2si %xmm3, %rax +; SSE-NEXT: xorq %rcx, %rax +; SSE-NEXT: cvttss2si %xmm1, %rcx +; SSE-NEXT: ucomiss %xmm2, %xmm1 +; SSE-NEXT: cmovaeq %rax, %rcx +; SSE-NEXT: movq %rcx, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; VEX-LABEL: fptoui_2f32_to_2i64_load: +; VEX: # %bb.0: +; VEX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; VEX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm2 +; VEX-NEXT: vcvttss2si %xmm2, %rax +; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vcvttss2si %xmm0, %rdx +; VEX-NEXT: vucomiss %xmm1, %xmm0 +; VEX-NEXT: cmovaeq %rax, %rdx +; VEX-NEXT: vmovq %rdx, %xmm2 +; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm3 +; VEX-NEXT: vcvttss2si %xmm3, %rax +; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vcvttss2si %xmm0, %rcx +; VEX-NEXT: vucomiss %xmm1, %xmm0 +; VEX-NEXT: cmovaeq %rax, %rcx +; VEX-NEXT: vmovq %rcx, %xmm0 +; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: fptoui_2f32_to_2i64_load: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: vcvttss2usi %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512F-NEXT: vcvttss2usi %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_2f32_to_2i64_load: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm1 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_2f32_to_2i64_load: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_2f32_to_2i64_load: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2uqq (%rdi), %xmm0 +; AVX512VLDQ-NEXT: retq + %a = load <2 x float>, <2 x float>* %x + %b = fptoui <2 x float> %a to <2 x i64> + ret <2 x i64> %b +} diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp-widen.ll b/llvm/test/CodeGen/X86/vec_int_to_fp-widen.ll new file mode 100644 index 00000000000..6891a3d0245 --- /dev/null +++ b/llvm/test/CodeGen/X86/vec_int_to_fp-widen.ll @@ -0,0 +1,6008 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2 +; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41 +; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,VEX,AVX1 +; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,VEX,AVX2 +; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512F +; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VL +; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512DQ +; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VLDQ +; +; 32-bit tests to make sure we're not doing anything stupid. +; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown +; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse +; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 +; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse4.1 + +; +; Signed Integer to Double +; + +define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) { +; SSE2-LABEL: sitofp_2i64_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: cvtsi2sd %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2sd %rax, %xmm0 +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_2i64_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cvtsi2sd %rax, %xmm1 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2sd %rax, %xmm0 +; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq +; +; VEX-LABEL: sitofp_2i64_to_2f64: +; VEX: # %bb.0: +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 +; VEX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: sitofp_2i64_to_2f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sitofp_2i64_to_2f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 +; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: sitofp_2i64_to_2f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: sitofp_2i64_to_2f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtqq2pd %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = sitofp <2 x i64> %a to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @sitofp_2i32_to_2f64(<4 x i32> %a) { +; SSE-LABEL: sitofp_2i32_to_2f64: +; SSE: # %bb.0: +; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_2i32_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1> + %cvt = sitofp <2 x i32> %shuf to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @sitofp_4i32_to_2f64(<4 x i32> %a) { +; SSE-LABEL: sitofp_4i32_to_2f64: +; SSE: # %bb.0: +; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_4i32_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = sitofp <4 x i32> %a to <4 x double> + %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1> + ret <2 x double> %shuf +} + +define <2 x double> @sitofp_2i16_to_2f64(<8 x i16> %a) { +; SSE2-LABEL: sitofp_2i16_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_2i16_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sitofp_2i16_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1> + %cvt = sitofp <2 x i16> %shuf to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) { +; SSE2-LABEL: sitofp_8i16_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_8i16_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: sitofp_8i16_to_2f64: +; VEX: # %bb.0: +; VEX-NEXT: vpmovsxwd %xmm0, %xmm0 +; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VEX-NEXT: retq +; +; AVX512-LABEL: sitofp_8i16_to_2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %cvt = sitofp <8 x i16> %a to <8 x double> + %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1> + ret <2 x double> %shuf +} + +define <2 x double> @sitofp_2i8_to_2f64(<16 x i8> %a) { +; SSE2-LABEL: sitofp_2i8_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_2i8_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sitofp_2i8_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1> + %cvt = sitofp <2 x i8> %shuf to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) { +; SSE2-LABEL: sitofp_16i8_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_16i8_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: sitofp_16i8_to_2f64: +; VEX: # %bb.0: +; VEX-NEXT: vpmovsxbd %xmm0, %xmm0 +; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VEX-NEXT: retq +; +; AVX512-LABEL: sitofp_16i8_to_2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %cvt = sitofp <16 x i8> %a to <16 x double> + %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1> + ret <2 x double> %shuf +} + +define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) { +; SSE2-LABEL: sitofp_4i64_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: cvtsi2sd %rax, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2sd %rax, %xmm0 +; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: cvtsi2sd %rax, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2sd %rax, %xmm0 +; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm3, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_4i64_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cvtsi2sd %rax, %xmm2 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2sd %rax, %xmm0 +; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2sd %rax, %xmm2 +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2sd %rax, %xmm1 +; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE41-NEXT: retq +; +; AVX1-LABEL: sitofp_4i64_to_4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sitofp_4i64_to_4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: sitofp_4i64_to_4f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sitofp_4i64_to_4f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vmovq %xmm1, %rax +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: sitofp_4i64_to_4f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: sitofp_4i64_to_4f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtqq2pd %ymm0, %ymm0 +; AVX512VLDQ-NEXT: retq + %cvt = sitofp <4 x i64> %a to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @sitofp_4i32_to_4f64(<4 x i32> %a) { +; SSE-LABEL: sitofp_4i32_to_4f64: +; SSE: # %bb.0: +; SSE-NEXT: cvtdq2pd %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE-NEXT: cvtdq2pd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_4i32_to_4f64: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX-NEXT: retq + %cvt = sitofp <4 x i32> %a to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) { +; SSE2-LABEL: sitofp_4i16_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_4i16_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwd %xmm0, %xmm1 +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX-LABEL: sitofp_4i16_to_4f64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX-NEXT: retq + %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %cvt = sitofp <4 x i16> %shuf to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) { +; SSE2-LABEL: sitofp_8i16_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_8i16_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwd %xmm0, %xmm1 +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; VEX-LABEL: sitofp_8i16_to_4f64: +; VEX: # %bb.0: +; VEX-NEXT: vpmovsxwd %xmm0, %xmm0 +; VEX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; VEX-NEXT: retq +; +; AVX512-LABEL: sitofp_8i16_to_4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512-NEXT: retq + %cvt = sitofp <8 x i16> %a to <8 x double> + %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x double> %shuf +} + +define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) { +; SSE2-LABEL: sitofp_4i8_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_4i8_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd %xmm0, %xmm1 +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX-LABEL: sitofp_4i8_to_4f64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX-NEXT: retq + %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %cvt = sitofp <4 x i8> %shuf to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) { +; SSE2-LABEL: sitofp_16i8_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_16i8_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd %xmm0, %xmm1 +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; VEX-LABEL: sitofp_16i8_to_4f64: +; VEX: # %bb.0: +; VEX-NEXT: vpmovsxbd %xmm0, %xmm0 +; VEX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; VEX-NEXT: retq +; +; AVX512-LABEL: sitofp_16i8_to_4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512-NEXT: retq + %cvt = sitofp <16 x i8> %a to <16 x double> + %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x double> %shuf +} + +; +; Unsigned Integer to Double +; + +define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) { +; SSE2-LABEL: uitofp_2i64_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: por {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: por {{.*}}(%rip), %xmm0 +; SSE2-NEXT: subpd {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_2i64_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; SSE41-NEXT: por {{.*}}(%rip), %xmm1 +; SSE41-NEXT: psrlq $32, %xmm0 +; SSE41-NEXT: por {{.*}}(%rip), %xmm0 +; SSE41-NEXT: subpd {{.*}}(%rip), %xmm0 +; SSE41-NEXT: addpd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_2i64_to_2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_2i64_to_2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: uitofp_2i64_to_2f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_2i64_to_2f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 +; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_2i64_to_2f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_2i64_to_2f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtuqq2pd %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = uitofp <2 x i64> %a to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) { +; SSE2-LABEL: uitofp_2i32_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_2i32_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE41-NEXT: addpd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_2i32_to_2f64: +; VEX: # %bb.0: +; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7] +; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1 +; VEX-NEXT: vpsrld $16, %xmm0, %xmm0 +; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0 +; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: uitofp_2i32_to_2f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_2i32_to_2f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_2i32_to_2f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_2i32_to_2f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1> + %cvt = uitofp <2 x i32> %shuf to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) { +; SSE2-LABEL: uitofp_4i32_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_4i32_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE41-NEXT: addpd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_4i32_to_2f64: +; VEX: # %bb.0: +; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; VEX-NEXT: vpsrld $16, %xmm0, %xmm0 +; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1 +; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0 +; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: uitofp_4i32_to_2f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_4i32_to_2f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_4i32_to_2f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_4i32_to_2f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = uitofp <4 x i32> %a to <4 x double> + %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1> + ret <2 x double> %shuf +} + +define <2 x double> @uitofp_2i16_to_2f64(<8 x i16> %a) { +; SSE2-LABEL: uitofp_2i16_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_2i16_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: uitofp_2i16_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1> + %cvt = uitofp <2 x i16> %shuf to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) { +; SSE2-LABEL: uitofp_8i16_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_8i16_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_8i16_to_2f64: +; VEX: # %bb.0: +; VEX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VEX-NEXT: retq +; +; AVX512-LABEL: uitofp_8i16_to_2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %cvt = uitofp <8 x i16> %a to <8 x double> + %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1> + ret <2 x double> %shuf +} + +define <2 x double> @uitofp_2i8_to_2f64(<16 x i8> %a) { +; SSE2-LABEL: uitofp_2i8_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_2i8_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: uitofp_2i8_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1> + %cvt = uitofp <2 x i8> %shuf to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) { +; SSE2-LABEL: uitofp_16i8_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_16i8_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_16i8_to_2f64: +; VEX: # %bb.0: +; VEX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VEX-NEXT: retq +; +; AVX512-LABEL: uitofp_16i8_to_2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %cvt = uitofp <16 x i8> %a to <16 x double> + %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1> + ret <2 x double> %shuf +} + +define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) { +; SSE2-LABEL: uitofp_4i64_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; SSE2-NEXT: subpd %xmm6, %xmm0 +; SSE2-NEXT: addpd %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: psrlq $32, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: subpd %xmm6, %xmm1 +; SSE2-NEXT: addpd %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_4i64_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] +; SSE41-NEXT: por %xmm4, %xmm3 +; SSE41-NEXT: psrlq $32, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; SSE41-NEXT: subpd %xmm6, %xmm0 +; SSE41-NEXT: addpd %xmm3, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; SSE41-NEXT: por %xmm4, %xmm2 +; SSE41-NEXT: psrlq $32, %xmm1 +; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: subpd %xmm6, %xmm1 +; SSE41-NEXT: addpd %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_4i64_to_4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vorpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_4i64_to_4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] +; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] +; AVX2-NEXT: vsubpd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: uitofp_4i64_to_4f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] +; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] +; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] +; AVX512F-NEXT: vsubpd %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_4i64_to_4f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1 +; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0 +; AVX512VL-NEXT: vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0 +; AVX512VL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_4i64_to_4f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_4i64_to_4f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtuqq2pd %ymm0, %ymm0 +; AVX512VLDQ-NEXT: retq + %cvt = uitofp <4 x i64> %a to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) { +; SSE2-LABEL: uitofp_4i32_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4] +; SSE2-NEXT: mulpd %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm5 +; SSE2-NEXT: mulpd %xmm2, %xmm5 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: cvtdq2pd %xmm4, %xmm1 +; SSE2-NEXT: addpd %xmm5, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_4i32_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4] +; SSE41-NEXT: mulpd %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4,5,6,7] +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: addpd %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm5 +; SSE41-NEXT: mulpd %xmm2, %xmm5 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] +; SSE41-NEXT: cvtdq2pd %xmm4, %xmm1 +; SSE41-NEXT: addpd %xmm5, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_4i32_to_4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_4i32_to_4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4] +; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: uitofp_4i32_to_4f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_4i32_to_4f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtudq2pd %xmm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_4i32_to_4f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_4i32_to_4f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %ymm0 +; AVX512VLDQ-NEXT: retq + %cvt = uitofp <4 x i32> %a to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) { +; SSE2-LABEL: uitofp_4i16_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_4i16_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX-LABEL: uitofp_4i16_to_4f64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX-NEXT: retq + %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %cvt = uitofp <4 x i16> %shuf to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) { +; SSE2-LABEL: uitofp_8i16_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_8i16_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_8i16_to_4f64: +; VEX: # %bb.0: +; VEX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; VEX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; VEX-NEXT: retq +; +; AVX512-LABEL: uitofp_8i16_to_4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512-NEXT: retq + %cvt = uitofp <8 x i16> %a to <8 x double> + %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x double> %shuf +} + +define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) { +; SSE2-LABEL: uitofp_4i8_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_4i8_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX-LABEL: uitofp_4i8_to_4f64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX-NEXT: retq + %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %cvt = uitofp <4 x i8> %shuf to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) { +; SSE2-LABEL: uitofp_16i8_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_16i8_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_16i8_to_4f64: +; VEX: # %bb.0: +; VEX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; VEX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; VEX-NEXT: retq +; +; AVX512-LABEL: uitofp_16i8_to_4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512-NEXT: retq + %cvt = uitofp <16 x i8> %a to <16 x double> + %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x double> %shuf +} + +; +; Signed Integer to Float +; + +define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) { +; SSE2-LABEL: sitofp_2i64_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_2i64_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cvtsi2ss %rax, %xmm1 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ss %rax, %xmm0 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; SSE41-NEXT: retq +; +; VEX-LABEL: sitofp_2i64_to_4f32: +; VEX: # %bb.0: +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; VEX-NEXT: retq +; +; AVX512F-LABEL: sitofp_2i64_to_4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sitofp_2i64_to_4f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: sitofp_2i64_to_4f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtqq2ps %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = sitofp <2 x i64> %a to <2 x float> + %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + ret <4 x float> %ext +} + +define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) { +; SSE2-LABEL: sitofp_2i64_to_4f32_zero: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_2i64_to_4f32_zero: +; SSE41: # %bb.0: +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: cvtsi2ss %rax, %xmm1 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ss %rax, %xmm0 +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],zero,zero +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: sitofp_2i64_to_4f32_zero: +; VEX: # %bb.0: +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero +; VEX-NEXT: retq +; +; AVX512F-LABEL: sitofp_2i64_to_4f32_zero: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sitofp_2i64_to_4f32_zero: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: sitofp_2i64_to_4f32_zero: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32_zero: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtqq2ps %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = sitofp <2 x i64> %a to <2 x float> + %ext = shufflevector <2 x float> %cvt, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x float> %ext +} + +define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) { +; SSE2-LABEL: sitofp_4i64_to_4f32_undef: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_4i64_to_4f32_undef: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cvtsi2ss %rax, %xmm1 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ss %rax, %xmm0 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; SSE41-NEXT: retq +; +; VEX-LABEL: sitofp_4i64_to_4f32_undef: +; VEX: # %bb.0: +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; VEX-NEXT: retq +; +; AVX512F-LABEL: sitofp_4i64_to_4f32_undef: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sitofp_4i64_to_4f32_undef: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: sitofp_4i64_to_4f32_undef: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32_undef: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq + %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + %cvt = sitofp <4 x i64> %ext to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @sitofp_4i32_to_4f32(<4 x i32> %a) { +; SSE-LABEL: sitofp_4i32_to_4f32: +; SSE: # %bb.0: +; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_4i32_to_4f32: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = sitofp <4 x i32> %a to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @sitofp_4i16_to_4f32(<8 x i16> %a) { +; SSE2-LABEL: sitofp_4i16_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_4i16_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sitofp_4i16_to_4f32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq + %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %cvt = sitofp <4 x i16> %shuf to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) { +; SSE2-LABEL: sitofp_8i16_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_8i16_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sitofp_8i16_to_4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: sitofp_8i16_to_4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: sitofp_8i16_to_4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %cvt = sitofp <8 x i16> %a to <8 x float> + %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x float> %shuf +} + +define <4 x float> @sitofp_4i8_to_4f32(<16 x i8> %a) { +; SSE2-LABEL: sitofp_4i8_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_4i8_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sitofp_4i8_to_4f32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq + %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %cvt = sitofp <4 x i8> %shuf to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) { +; SSE2-LABEL: sitofp_16i8_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_16i8_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sitofp_16i8_to_4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: sitofp_16i8_to_4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: sitofp_16i8_to_4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %cvt = sitofp <16 x i8> %a to <16 x float> + %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x float> %shuf +} + +define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) { +; SSE2-LABEL: sitofp_4i64_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: cvtsi2ss %rax, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_4i64_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cvtsi2ss %rax, %xmm2 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ss %rax, %xmm0 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ss %rax, %xmm2 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ss %rax, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: retq +; +; AVX1-LABEL: sitofp_4i64_to_4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: sitofp_4i64_to_4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: sitofp_4i64_to_4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sitofp_4i64_to_4f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: sitofp_4i64_to_4f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq + %cvt = sitofp <4 x i64> %a to <4 x float> + ret <4 x float> %cvt +} + +define <8 x float> @sitofp_8i32_to_8f32(<8 x i32> %a) { +; SSE-LABEL: sitofp_8i32_to_8f32: +; SSE: # %bb.0: +; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_8i32_to_8f32: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX-NEXT: retq + %cvt = sitofp <8 x i32> %a to <8 x float> + ret <8 x float> %cvt +} + +define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) { +; SSE2-LABEL: sitofp_8i16_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: cvtdq2ps %xmm1, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_8i16_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwd %xmm0, %xmm1 +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sitofp_8i16_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sitofp_8i16_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sitofp_8i16_to_8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512-NEXT: retq + %cvt = sitofp <8 x i16> %a to <8 x float> + ret <8 x float> %cvt +} + +define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) { +; SSE2-LABEL: sitofp_8i8_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: cvtdq2ps %xmm1, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_8i8_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd %xmm0, %xmm1 +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sitofp_8i8_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sitofp_8i8_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sitofp_8i8_to_8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512-NEXT: retq + %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %cvt = sitofp <8 x i8> %shuf to <8 x float> + ret <8 x float> %cvt +} + +define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) { +; SSE2-LABEL: sitofp_16i8_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: cvtdq2ps %xmm1, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_16i8_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd %xmm0, %xmm1 +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sitofp_16i8_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sitofp_16i8_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sitofp_16i8_to_8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512-NEXT: retq + %cvt = sitofp <16 x i8> %a to <16 x float> + %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %shuf +} + +; +; Unsigned Integer to Float +; + +define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) { +; SSE2-LABEL: uitofp_2i64_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB39_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: jmp .LBB39_3 +; SSE2-NEXT: .LBB39_1: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: addss %xmm0, %xmm0 +; SSE2-NEXT: .LBB39_3: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB39_4 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB39_4: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_2i64_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB39_1 +; SSE41-NEXT: # %bb.2: +; SSE41-NEXT: cvtsi2ss %rax, %xmm1 +; SSE41-NEXT: jmp .LBB39_3 +; SSE41-NEXT: .LBB39_1: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: cvtsi2ss %rax, %xmm1 +; SSE41-NEXT: addss %xmm1, %xmm1 +; SSE41-NEXT: .LBB39_3: +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB39_4 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ss %rax, %xmm0 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; SSE41-NEXT: retq +; SSE41-NEXT: .LBB39_4: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ss %rax, %xmm0 +; SSE41-NEXT: addss %xmm0, %xmm0 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_2i64_to_4f32: +; VEX: # %bb.0: +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB39_1 +; VEX-NEXT: # %bb.2: +; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; VEX-NEXT: jmp .LBB39_3 +; VEX-NEXT: .LBB39_1: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; VEX-NEXT: .LBB39_3: +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB39_4 +; VEX-NEXT: # %bb.5: +; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; VEX-NEXT: retq +; VEX-NEXT: .LBB39_4: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; VEX-NEXT: retq +; +; AVX512F-LABEL: uitofp_2i64_to_4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_2i64_to_4f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_2i64_to_4f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_2i64_to_4f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtuqq2ps %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = uitofp <2 x i64> %a to <2 x float> + %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + ret <4 x float> %ext +} + +define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) { +; SSE2-LABEL: uitofp_2i64_to_2f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB40_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: jmp .LBB40_3 +; SSE2-NEXT: .LBB40_1: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm1 +; SSE2-NEXT: .LBB40_3: +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB40_4 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: jmp .LBB40_6 +; SSE2-NEXT: .LBB40_4: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: addss %xmm0, %xmm0 +; SSE2-NEXT: .LBB40_6: +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_2i64_to_2f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB40_1 +; SSE41-NEXT: # %bb.2: +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ss %rax, %xmm0 +; SSE41-NEXT: jmp .LBB40_3 +; SSE41-NEXT: .LBB40_1: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ss %rax, %xmm0 +; SSE41-NEXT: addss %xmm0, %xmm0 +; SSE41-NEXT: .LBB40_3: +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB40_4 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ss %rax, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; SSE41-NEXT: retq +; SSE41-NEXT: .LBB40_4: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ss %rax, %xmm1 +; SSE41-NEXT: addss %xmm1, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_2i64_to_2f32: +; VEX: # %bb.0: +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB40_1 +; VEX-NEXT: # %bb.2: +; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; VEX-NEXT: jmp .LBB40_3 +; VEX-NEXT: .LBB40_1: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; VEX-NEXT: .LBB40_3: +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB40_4 +; VEX-NEXT: # %bb.5: +; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero +; VEX-NEXT: retq +; VEX-NEXT: .LBB40_4: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero +; VEX-NEXT: retq +; +; AVX512F-LABEL: uitofp_2i64_to_2f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_2i64_to_2f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_2i64_to_2f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_2i64_to_2f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtuqq2ps %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = uitofp <2 x i64> %a to <2 x float> + %ext = shufflevector <2 x float> %cvt, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x float> %ext +} + +define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { +; SSE2-LABEL: uitofp_4i64_to_4f32_undef: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB41_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: jmp .LBB41_3 +; SSE2-NEXT: .LBB41_1: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm1 +; SSE2-NEXT: .LBB41_3: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB41_4 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: jmp .LBB41_6 +; SSE2-NEXT: .LBB41_4: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: addss %xmm0, %xmm0 +; SSE2-NEXT: .LBB41_6: +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_4i64_to_4f32_undef: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB41_1 +; SSE41-NEXT: # %bb.2: +; SSE41-NEXT: cvtsi2ss %rax, %xmm1 +; SSE41-NEXT: jmp .LBB41_3 +; SSE41-NEXT: .LBB41_1: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: cvtsi2ss %rax, %xmm1 +; SSE41-NEXT: addss %xmm1, %xmm1 +; SSE41-NEXT: .LBB41_3: +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB41_4 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ss %rax, %xmm0 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; SSE41-NEXT: retq +; SSE41-NEXT: .LBB41_4: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ss %rax, %xmm0 +; SSE41-NEXT: addss %xmm0, %xmm0 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_4i64_to_4f32_undef: +; VEX: # %bb.0: +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB41_1 +; VEX-NEXT: # %bb.2: +; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; VEX-NEXT: jmp .LBB41_3 +; VEX-NEXT: .LBB41_1: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; VEX-NEXT: .LBB41_3: +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB41_4 +; VEX-NEXT: # %bb.5: +; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; VEX-NEXT: retq +; VEX-NEXT: .LBB41_4: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; VEX-NEXT: retq +; +; AVX512F-LABEL: uitofp_4i64_to_4f32_undef: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_4i64_to_4f32_undef: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_4i64_to_4f32_undef: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32_undef: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq + %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + %cvt = uitofp <4 x i64> %ext to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @uitofp_4i32_to_4f32(<4 x i32> %a) { +; SSE2-LABEL: uitofp_4i32_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: por {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: por {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addps {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_4i32_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; SSE41-NEXT: addps {{.*}}(%rip), %xmm0 +; SSE41-NEXT: addps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_4i32_to_4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_4i32_to_4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200] +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] +; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: uitofp_4i32_to_4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_4i32_to_4f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_4i32_to_4f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_4i32_to_4f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtudq2ps %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = uitofp <4 x i32> %a to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @uitofp_4i16_to_4f32(<8 x i16> %a) { +; SSE2-LABEL: uitofp_4i16_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_4i16_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: uitofp_4i16_to_4f32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq + %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %cvt = uitofp <4 x i16> %shuf to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) { +; SSE2-LABEL: uitofp_8i16_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_8i16_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_8i16_to_4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_8i16_to_4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: uitofp_8i16_to_4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %cvt = uitofp <8 x i16> %a to <8 x float> + %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x float> %shuf +} + +define <4 x float> @uitofp_4i8_to_4f32(<16 x i8> %a) { +; SSE2-LABEL: uitofp_4i8_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_4i8_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: uitofp_4i8_to_4f32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq + %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %cvt = uitofp <4 x i8> %shuf to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) { +; SSE2-LABEL: uitofp_16i8_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_16i8_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_16i8_to_4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_16i8_to_4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: uitofp_16i8_to_4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %cvt = uitofp <16 x i8> %a to <16 x float> + %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x float> %shuf +} + +define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { +; SSE2-LABEL: uitofp_4i64_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB47_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: cvtsi2ss %rax, %xmm2 +; SSE2-NEXT: jmp .LBB47_3 +; SSE2-NEXT: .LBB47_1: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ss %rax, %xmm2 +; SSE2-NEXT: addss %xmm2, %xmm2 +; SSE2-NEXT: .LBB47_3: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB47_4 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: cvtsi2ss %rax, %xmm3 +; SSE2-NEXT: jmp .LBB47_6 +; SSE2-NEXT: .LBB47_4: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ss %rax, %xmm3 +; SSE2-NEXT: addss %xmm3, %xmm3 +; SSE2-NEXT: .LBB47_6: +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB47_7 +; SSE2-NEXT: # %bb.8: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: jmp .LBB47_9 +; SSE2-NEXT: .LBB47_7: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm1 +; SSE2-NEXT: .LBB47_9: +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB47_10 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: jmp .LBB47_12 +; SSE2-NEXT: .LBB47_10: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: addss %xmm0, %xmm0 +; SSE2-NEXT: .LBB47_12: +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_4i64_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB47_1 +; SSE41-NEXT: # %bb.2: +; SSE41-NEXT: cvtsi2ss %rax, %xmm2 +; SSE41-NEXT: jmp .LBB47_3 +; SSE41-NEXT: .LBB47_1: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: cvtsi2ss %rax, %xmm2 +; SSE41-NEXT: addss %xmm2, %xmm2 +; SSE41-NEXT: .LBB47_3: +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB47_4 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ss %rax, %xmm0 +; SSE41-NEXT: jmp .LBB47_6 +; SSE41-NEXT: .LBB47_4: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ss %rax, %xmm0 +; SSE41-NEXT: addss %xmm0, %xmm0 +; SSE41-NEXT: .LBB47_6: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB47_7 +; SSE41-NEXT: # %bb.8: +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ss %rax, %xmm2 +; SSE41-NEXT: jmp .LBB47_9 +; SSE41-NEXT: .LBB47_7: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ss %rax, %xmm2 +; SSE41-NEXT: addss %xmm2, %xmm2 +; SSE41-NEXT: .LBB47_9: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB47_10 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ss %rax, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: retq +; SSE41-NEXT: .LBB47_10: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ss %rax, %xmm1 +; SSE41-NEXT: addss %xmm1, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_4i64_to_4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB47_1 +; AVX1-NEXT: # %bb.2: +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: jmp .LBB47_3 +; AVX1-NEXT: .LBB47_1: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: .LBB47_3: +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB47_4 +; AVX1-NEXT: # %bb.5: +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: jmp .LBB47_6 +; AVX1-NEXT: .LBB47_4: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: .LBB47_6: +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB47_7 +; AVX1-NEXT: # %bb.8: +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-NEXT: jmp .LBB47_9 +; AVX1-NEXT: .LBB47_7: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: .LBB47_9: +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB47_10 +; AVX1-NEXT: # %bb.11: +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB47_10: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_4i64_to_4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: testq %rax, %rax +; AVX2-NEXT: js .LBB47_1 +; AVX2-NEXT: # %bb.2: +; AVX2-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX2-NEXT: jmp .LBB47_3 +; AVX2-NEXT: .LBB47_1: +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: .LBB47_3: +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: testq %rax, %rax +; AVX2-NEXT: js .LBB47_4 +; AVX2-NEXT: # %bb.5: +; AVX2-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX2-NEXT: jmp .LBB47_6 +; AVX2-NEXT: .LBB47_4: +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: .LBB47_6: +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: testq %rax, %rax +; AVX2-NEXT: js .LBB47_7 +; AVX2-NEXT: # %bb.8: +; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX2-NEXT: jmp .LBB47_9 +; AVX2-NEXT: .LBB47_7: +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: .LBB47_9: +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: testq %rax, %rax +; AVX2-NEXT: js .LBB47_10 +; AVX2-NEXT: # %bb.11: +; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB47_10: +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: uitofp_4i64_to_4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_4i64_to_4f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_4i64_to_4f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq + %cvt = uitofp <4 x i64> %a to <4 x float> + ret <4 x float> %cvt +} + +define <8 x float> @uitofp_8i32_to_8f32(<8 x i32> %a) { +; SSE2-LABEL: uitofp_8i32_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200] +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928] +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movaps {{.*#+}} xmm6 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] +; SSE2-NEXT: addps %xmm6, %xmm0 +; SSE2-NEXT: addps %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: addps %xmm6, %xmm1 +; SSE2-NEXT: addps %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_8i32_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1392508928,1392508928,1392508928,1392508928] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] +; SSE41-NEXT: movaps {{.*#+}} xmm5 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] +; SSE41-NEXT: addps %xmm5, %xmm0 +; SSE41-NEXT: addps %xmm3, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] +; SSE41-NEXT: addps %xmm5, %xmm1 +; SSE41-NEXT: addps %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_8i32_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_8i32_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] +; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: uitofp_8i32_to_8f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_8i32_to_8f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtudq2ps %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_8i32_to_8f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_8i32_to_8f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtudq2ps %ymm0, %ymm0 +; AVX512VLDQ-NEXT: retq + %cvt = uitofp <8 x i32> %a to <8 x float> + ret <8 x float> %cvt +} + +define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) { +; SSE2-LABEL: uitofp_8i16_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: cvtdq2ps %xmm2, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_8i16_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_8i16_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_8i16_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: uitofp_8i16_to_8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512-NEXT: retq + %cvt = uitofp <8 x i16> %a to <8 x float> + ret <8 x float> %cvt +} + +define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) { +; SSE2-LABEL: uitofp_8i8_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: cvtdq2ps %xmm2, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_8i8_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_8i8_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_8i8_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: uitofp_8i8_to_8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512-NEXT: retq + %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %cvt = uitofp <8 x i8> %shuf to <8 x float> + ret <8 x float> %cvt +} + +define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) { +; SSE2-LABEL: uitofp_16i8_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: cvtdq2ps %xmm2, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_16i8_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_16i8_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_16i8_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: uitofp_16i8_to_8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512-NEXT: retq + %cvt = uitofp <16 x i8> %a to <16 x float> + %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %shuf +} + +; +; Load Signed Integer to Double +; + +define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) { +; SSE2-LABEL: sitofp_load_2i64_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: cvtsi2sd %rax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2sd %rax, %xmm1 +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_2i64_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cvtsi2sd %rax, %xmm1 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2sd %rax, %xmm0 +; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq +; +; VEX-LABEL: sitofp_load_2i64_to_2f64: +; VEX: # %bb.0: +; VEX-NEXT: vmovdqa (%rdi), %xmm0 +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 +; VEX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: sitofp_load_2i64_to_2f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sitofp_load_2i64_to_2f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 +; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: sitofp_load_2i64_to_2f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: sitofp_load_2i64_to_2f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtqq2pd (%rdi), %xmm0 +; AVX512VLDQ-NEXT: retq + %ld = load <2 x i64>, <2 x i64> *%a + %cvt = sitofp <2 x i64> %ld to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @sitofp_load_2i32_to_2f64(<2 x i32> *%a) { +; SSE-LABEL: sitofp_load_2i32_to_2f64: +; SSE: # %bb.0: +; SSE-NEXT: cvtdq2pd (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_load_2i32_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2pd (%rdi), %xmm0 +; AVX-NEXT: retq + %ld = load <2 x i32>, <2 x i32> *%a + %cvt = sitofp <2 x i32> %ld to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @sitofp_volatile_load_4i32_to_2f64(<4 x i32> *%a) { +; SSE-LABEL: sitofp_volatile_load_4i32_to_2f64: +; SSE: # %bb.0: +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_volatile_load_4i32_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load volatile <4 x i32>, <4 x i32> *%a + %b = shufflevector <4 x i32> %ld, <4 x i32> undef, <2 x i32> <i32 0, i32 1> + %cvt = sitofp <2 x i32> %b to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @sitofp_load_4i32_to_2f64_2(<4 x i32>* %x) { +; SSE-LABEL: sitofp_load_4i32_to_2f64_2: +; SSE: # %bb.0: +; SSE-NEXT: cvtdq2pd (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_load_4i32_to_2f64_2: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2pd (%rdi), %xmm0 +; AVX-NEXT: retq + %a = load <4 x i32>, <4 x i32>* %x + %b = sitofp <4 x i32> %a to <4 x double> + %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> <i32 0, i32 1> + ret <2 x double> %c +} + +define <2 x double> @sitofp_volatile_load_4i32_to_2f64_2(<4 x i32>* %x) { +; SSE-LABEL: sitofp_volatile_load_4i32_to_2f64_2: +; SSE: # %bb.0: +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_volatile_load_4i32_to_2f64_2: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %a = load volatile <4 x i32>, <4 x i32>* %x + %b = sitofp <4 x i32> %a to <4 x double> + %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> <i32 0, i32 1> + ret <2 x double> %c +} + +define <2 x double> @sitofp_load_2i16_to_2f64(<2 x i16> *%a) { +; SSE2-LABEL: sitofp_load_2i16_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_2i16_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sitofp_load_2i16_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <2 x i16>, <2 x i16> *%a + %cvt = sitofp <2 x i16> %ld to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @sitofp_load_2i8_to_2f64(<2 x i8> *%a) { +; SSE2-LABEL: sitofp_load_2i8_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movzwl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_2i8_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movzwl (%rdi), %eax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sitofp_load_2i8_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: movzwl (%rdi), %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <2 x i8>, <2 x i8> *%a + %cvt = sitofp <2 x i8> %ld to <2 x double> + ret <2 x double> %cvt +} + +define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) { +; SSE2-LABEL: sitofp_load_4i64_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movdqa 16(%rdi), %xmm2 +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: cvtsi2sd %rax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2sd %rax, %xmm1 +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2sd %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: cvtsi2sd %rax, %xmm2 +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_4i64_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cvtsi2sd %rax, %xmm2 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2sd %rax, %xmm0 +; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2sd %rax, %xmm2 +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2sd %rax, %xmm1 +; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE41-NEXT: retq +; +; VEX-LABEL: sitofp_load_4i64_to_4f64: +; VEX: # %bb.0: +; VEX-NEXT: vmovdqa (%rdi), %xmm0 +; VEX-NEXT: vmovdqa 16(%rdi), %xmm1 +; VEX-NEXT: vpextrq $1, %xmm1, %rax +; VEX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; VEX-NEXT: vmovq %xmm1, %rax +; VEX-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; VEX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; VEX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: sitofp_load_4i64_to_4f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sitofp_load_4i64_to_4f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vmovq %xmm1, %rax +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: sitofp_load_4i64_to_4f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: sitofp_load_4i64_to_4f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtqq2pd (%rdi), %ymm0 +; AVX512VLDQ-NEXT: retq + %ld = load <4 x i64>, <4 x i64> *%a + %cvt = sitofp <4 x i64> %ld to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @sitofp_load_4i32_to_4f64(<4 x i32> *%a) { +; SSE-LABEL: sitofp_load_4i32_to_4f64: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_load_4i32_to_4f64: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2pd (%rdi), %ymm0 +; AVX-NEXT: retq + %ld = load <4 x i32>, <4 x i32> *%a + %cvt = sitofp <4 x i32> %ld to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) { +; SSE2-LABEL: sitofp_load_4i16_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_4i16_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwd (%rdi), %xmm1 +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX-LABEL: sitofp_load_4i16_to_4f64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxwd (%rdi), %xmm0 +; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX-NEXT: retq + %ld = load <4 x i16>, <4 x i16> *%a + %cvt = sitofp <4 x i16> %ld to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) { +; SSE2-LABEL: sitofp_load_4i8_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_4i8_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd (%rdi), %xmm1 +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX-LABEL: sitofp_load_4i8_to_4f64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxbd (%rdi), %xmm0 +; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX-NEXT: retq + %ld = load <4 x i8>, <4 x i8> *%a + %cvt = sitofp <4 x i8> %ld to <4 x double> + ret <4 x double> %cvt +} + +; +; Load Unsigned Integer to Double +; + +define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) { +; SSE2-LABEL: uitofp_load_2i64_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: por {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: por {{.*}}(%rip), %xmm0 +; SSE2-NEXT: subpd {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_2i64_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; SSE41-NEXT: por {{.*}}(%rip), %xmm1 +; SSE41-NEXT: psrlq $32, %xmm0 +; SSE41-NEXT: por {{.*}}(%rip), %xmm0 +; SSE41-NEXT: subpd {{.*}}(%rip), %xmm0 +; SSE41-NEXT: addpd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_load_2i64_to_2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_load_2i64_to_2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: uitofp_load_2i64_to_2f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_load_2i64_to_2f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 +; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_load_2i64_to_2f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_load_2i64_to_2f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtuqq2pd (%rdi), %xmm0 +; AVX512VLDQ-NEXT: retq + %ld = load <2 x i64>, <2 x i64> *%a + %cvt = uitofp <2 x i64> %ld to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) { +; SSE2-LABEL: uitofp_load_2i32_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_2i32_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE41-NEXT: addpd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_load_2i32_to_2f64: +; VEX: # %bb.0: +; VEX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7] +; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1 +; VEX-NEXT: vpsrld $16, %xmm0, %xmm0 +; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0 +; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: uitofp_load_2i32_to_2f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_load_2i32_to_2f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtudq2pd (%rdi), %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_load_2i32_to_2f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_load_2i32_to_2f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtudq2pd (%rdi), %xmm0 +; AVX512VLDQ-NEXT: retq + %ld = load <2 x i32>, <2 x i32> *%a + %cvt = uitofp <2 x i32> %ld to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @uitofp_load_4i32_to_2f64_2(<4 x i32>* %x) { +; SSE2-LABEL: uitofp_load_4i32_to_2f64_2: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i32_to_2f64_2: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE41-NEXT: addpd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_load_4i32_to_2f64_2: +; VEX: # %bb.0: +; VEX-NEXT: vmovdqa (%rdi), %xmm0 +; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; VEX-NEXT: vpsrld $16, %xmm0, %xmm0 +; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1 +; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0 +; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: uitofp_load_4i32_to_2f64_2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_load_4i32_to_2f64_2: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtudq2pd (%rdi), %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_load_4i32_to_2f64_2: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_load_4i32_to_2f64_2: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtudq2pd (%rdi), %xmm0 +; AVX512VLDQ-NEXT: retq + %a = load <4 x i32>, <4 x i32>* %x + %b = uitofp <4 x i32> %a to <4 x double> + %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> <i32 0, i32 1> + ret <2 x double> %c +} + +define <2 x double> @uitofp_volatile_load_4i32_to_2f64_2(<4 x i32>* %x) { +; SSE2-LABEL: uitofp_volatile_load_4i32_to_2f64_2: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_volatile_load_4i32_to_2f64_2: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE41-NEXT: addpd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_volatile_load_4i32_to_2f64_2: +; VEX: # %bb.0: +; VEX-NEXT: vmovdqa (%rdi), %xmm0 +; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; VEX-NEXT: vpsrld $16, %xmm0, %xmm0 +; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1 +; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0 +; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: uitofp_volatile_load_4i32_to_2f64_2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps (%rdi), %xmm0 +; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_volatile_load_4i32_to_2f64_2: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_volatile_load_4i32_to_2f64_2: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_volatile_load_4i32_to_2f64_2: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %a = load volatile <4 x i32>, <4 x i32>* %x + %b = uitofp <4 x i32> %a to <4 x double> + %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> <i32 0, i32 1> + ret <2 x double> %c +} + +define <2 x double> @uitofp_load_2i16_to_2f64(<2 x i16> *%a) { +; SSE2-LABEL: uitofp_load_2i16_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_2i16_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: uitofp_load_2i16_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <2 x i16>, <2 x i16> *%a + %cvt = uitofp <2 x i16> %ld to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @uitofp_load_2i8_to_2f64(<2 x i8> *%a) { +; SSE2-LABEL: uitofp_load_2i8_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movzwl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_2i8_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movzwl (%rdi), %eax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: uitofp_load_2i8_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: movzwl (%rdi), %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <2 x i8>, <2 x i8> *%a + %cvt = uitofp <2 x i8> %ld to <2 x double> + ret <2 x double> %cvt +} + +define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) { +; SSE2-LABEL: uitofp_load_4i64_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; SSE2-NEXT: subpd %xmm6, %xmm0 +; SSE2-NEXT: addpd %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: psrlq $32, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: subpd %xmm6, %xmm1 +; SSE2-NEXT: addpd %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i64_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] +; SSE41-NEXT: por %xmm4, %xmm3 +; SSE41-NEXT: psrlq $32, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; SSE41-NEXT: subpd %xmm6, %xmm0 +; SSE41-NEXT: addpd %xmm3, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; SSE41-NEXT: por %xmm4, %xmm2 +; SSE41-NEXT: psrlq $32, %xmm1 +; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: subpd %xmm6, %xmm1 +; SSE41-NEXT: addpd %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_load_4i64_to_4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4],ymm0[5],mem[6],ymm0[7] +; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vorpd {{.*}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_load_4i64_to_4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] +; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] +; AVX2-NEXT: vsubpd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: uitofp_load_4i64_to_4f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] +; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] +; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] +; AVX512F-NEXT: vsubpd %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_load_4i64_to_4f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1 +; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0 +; AVX512VL-NEXT: vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0 +; AVX512VL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_load_4i64_to_4f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_load_4i64_to_4f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtuqq2pd (%rdi), %ymm0 +; AVX512VLDQ-NEXT: retq + %ld = load <4 x i64>, <4 x i64> *%a + %cvt = uitofp <4 x i64> %ld to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) { +; SSE2-LABEL: uitofp_load_4i32_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4] +; SSE2-NEXT: mulpd %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm5 +; SSE2-NEXT: mulpd %xmm2, %xmm5 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: cvtdq2pd %xmm4, %xmm1 +; SSE2-NEXT: addpd %xmm5, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i32_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4] +; SSE41-NEXT: mulpd %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4,5,6,7] +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: addpd %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm5 +; SSE41-NEXT: mulpd %xmm2, %xmm5 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] +; SSE41-NEXT: cvtdq2pd %xmm4, %xmm1 +; SSE41-NEXT: addpd %xmm5, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_load_4i32_to_4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_load_4i32_to_4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4] +; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: uitofp_load_4i32_to_4f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps (%rdi), %xmm0 +; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_load_4i32_to_4f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtudq2pd (%rdi), %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_load_4i32_to_4f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_load_4i32_to_4f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtudq2pd (%rdi), %ymm0 +; AVX512VLDQ-NEXT: retq + %ld = load <4 x i32>, <4 x i32> *%a + %cvt = uitofp <4 x i32> %ld to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) { +; SSE2-LABEL: uitofp_load_4i16_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i16_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX-LABEL: uitofp_load_4i16_to_4f64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX-NEXT: retq + %ld = load <4 x i16>, <4 x i16> *%a + %cvt = uitofp <4 x i16> %ld to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @uitofp_load_4i8_to_4f64(<4 x i8> *%a) { +; SSE2-LABEL: uitofp_load_4i8_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i8_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX-LABEL: uitofp_load_4i8_to_4f64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX-NEXT: retq + %ld = load <4 x i8>, <4 x i8> *%a + %cvt = uitofp <4 x i8> %ld to <4 x double> + ret <4 x double> %cvt +} + +; +; Load Signed Integer to Float +; + +define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) { +; SSE2-LABEL: sitofp_load_4i64_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: cvtsi2ss %rax, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_4i64_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cvtsi2ss %rax, %xmm2 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ss %rax, %xmm0 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ss %rax, %xmm2 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ss %rax, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: retq +; +; VEX-LABEL: sitofp_load_4i64_to_4f32: +; VEX: # %bb.0: +; VEX-NEXT: vmovdqa (%rdi), %xmm0 +; VEX-NEXT: vmovdqa 16(%rdi), %xmm1 +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; VEX-NEXT: vmovq %xmm1, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; VEX-NEXT: vpextrq $1, %xmm1, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: sitofp_load_4i64_to_4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sitofp_load_4i64_to_4f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX512VL-NEXT: vmovq %xmm1, %rax +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: sitofp_load_4i64_to_4f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: sitofp_load_4i64_to_4f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtqq2psy (%rdi), %xmm0 +; AVX512VLDQ-NEXT: retq + %ld = load <4 x i64>, <4 x i64> *%a + %cvt = sitofp <4 x i64> %ld to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @sitofp_load_4i32_to_4f32(<4 x i32> *%a) { +; SSE-LABEL: sitofp_load_4i32_to_4f32: +; SSE: # %bb.0: +; SSE-NEXT: cvtdq2ps (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_load_4i32_to_4f32: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2ps (%rdi), %xmm0 +; AVX-NEXT: retq + %ld = load <4 x i32>, <4 x i32> *%a + %cvt = sitofp <4 x i32> %ld to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @sitofp_load_4i16_to_4f32(<4 x i16> *%a) { +; SSE2-LABEL: sitofp_load_4i16_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_4i16_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwd (%rdi), %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sitofp_load_4i16_to_4f32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxwd (%rdi), %xmm0 +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <4 x i16>, <4 x i16> *%a + %cvt = sitofp <4 x i16> %ld to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @sitofp_load_4i8_to_4f32(<4 x i8> *%a) { +; SSE2-LABEL: sitofp_load_4i8_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_4i8_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd (%rdi), %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sitofp_load_4i8_to_4f32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxbd (%rdi), %xmm0 +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <4 x i8>, <4 x i8> *%a + %cvt = sitofp <4 x i8> %ld to <4 x float> + ret <4 x float> %cvt +} + +define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) { +; SSE2-LABEL: sitofp_load_8i64_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm3 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: cvtsi2ss %rax, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE2-NEXT: movq %xmm3, %rax +; SSE2-NEXT: xorps %xmm4, %xmm4 +; SSE2-NEXT: cvtsi2ss %rax, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: cvtsi2ss %rax, %xmm2 +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_8i64_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: movdqa 32(%rdi), %xmm2 +; SSE41-NEXT: movdqa 48(%rdi), %xmm3 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cvtsi2ss %rax, %xmm4 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ss %rax, %xmm0 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3] +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: xorps %xmm4, %xmm4 +; SSE41-NEXT: cvtsi2ss %rax, %xmm4 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0],xmm0[3] +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ss %rax, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: pextrq $1, %xmm2, %rax +; SSE41-NEXT: xorps %xmm4, %xmm4 +; SSE41-NEXT: cvtsi2ss %rax, %xmm4 +; SSE41-NEXT: movq %xmm2, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ss %rax, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2,3] +; SSE41-NEXT: movq %xmm3, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ss %rax, %xmm2 +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; SSE41-NEXT: pextrq $1, %xmm3, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ss %rax, %xmm2 +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; SSE41-NEXT: retq +; +; VEX-LABEL: sitofp_load_8i64_to_8f32: +; VEX: # %bb.0: +; VEX-NEXT: vmovdqa (%rdi), %xmm0 +; VEX-NEXT: vmovdqa 16(%rdi), %xmm1 +; VEX-NEXT: vmovdqa 32(%rdi), %xmm2 +; VEX-NEXT: vmovdqa 48(%rdi), %xmm3 +; VEX-NEXT: vpextrq $1, %xmm2, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 +; VEX-NEXT: vmovq %xmm2, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 +; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; VEX-NEXT: vmovq %xmm3, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; VEX-NEXT: vpextrq $1, %xmm3, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; VEX-NEXT: vmovq %xmm1, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] +; VEX-NEXT: vpextrq $1, %xmm1, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; VEX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: sitofp_load_8i64_to_8f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512F-NEXT: vpextrq $1, %xmm2, %rax +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 +; AVX512F-NEXT: vmovq %xmm2, %rax +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; AVX512F-NEXT: vmovq %xmm3, %rax +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; AVX512F-NEXT: vpextrq $1, %xmm3, %rax +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sitofp_load_8i64_to_8f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 +; AVX512VL-NEXT: vmovq %xmm2, %rax +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; AVX512VL-NEXT: vmovq %xmm3, %rax +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; AVX512VL-NEXT: vpextrq $1, %xmm3, %rax +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; AVX512VL-NEXT: vmovq %xmm1, %rax +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: sitofp_load_8i64_to_8f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvtqq2ps (%rdi), %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: sitofp_load_8i64_to_8f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtqq2ps (%rdi), %ymm0 +; AVX512VLDQ-NEXT: retq + %ld = load <8 x i64>, <8 x i64> *%a + %cvt = sitofp <8 x i64> %ld to <8 x float> + ret <8 x float> %cvt +} + +define <8 x float> @sitofp_load_8i32_to_8f32(<8 x i32> *%a) { +; SSE-LABEL: sitofp_load_8i32_to_8f32: +; SSE: # %bb.0: +; SSE-NEXT: cvtdq2ps (%rdi), %xmm0 +; SSE-NEXT: cvtdq2ps 16(%rdi), %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_load_8i32_to_8f32: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2ps (%rdi), %ymm0 +; AVX-NEXT: retq + %ld = load <8 x i32>, <8 x i32> *%a + %cvt = sitofp <8 x i32> %ld to <8 x float> + ret <8 x float> %cvt +} + +define <8 x float> @sitofp_load_8i16_to_8f32(<8 x i16> *%a) { +; SSE2-LABEL: sitofp_load_8i16_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_8i16_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwd 8(%rdi), %xmm1 +; SSE41-NEXT: pmovsxwd (%rdi), %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sitofp_load_8i16_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm0 +; AVX1-NEXT: vpmovsxwd (%rdi), %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sitofp_load_8i16_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxwd (%rdi), %ymm0 +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sitofp_load_8i16_to_8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxwd (%rdi), %ymm0 +; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512-NEXT: retq + %ld = load <8 x i16>, <8 x i16> *%a + %cvt = sitofp <8 x i16> %ld to <8 x float> + ret <8 x float> %cvt +} + +define <8 x float> @sitofp_load_8i8_to_8f32(<8 x i8> *%a) { +; SSE2-LABEL: sitofp_load_8i8_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_8i8_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd 4(%rdi), %xmm1 +; SSE41-NEXT: pmovsxbd (%rdi), %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sitofp_load_8i8_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxbd 4(%rdi), %xmm0 +; AVX1-NEXT: vpmovsxbd (%rdi), %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sitofp_load_8i8_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbd (%rdi), %ymm0 +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sitofp_load_8i8_to_8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd (%rdi), %ymm0 +; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512-NEXT: retq + %ld = load <8 x i8>, <8 x i8> *%a + %cvt = sitofp <8 x i8> %ld to <8 x float> + ret <8 x float> %cvt +} + +; +; Load Unsigned Integer to Float +; + +define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { +; SSE2-LABEL: uitofp_load_4i64_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm2 +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB81_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: jmp .LBB81_3 +; SSE2-NEXT: .LBB81_1: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm1 +; SSE2-NEXT: .LBB81_3: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB81_4 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: cvtsi2ss %rax, %xmm3 +; SSE2-NEXT: jmp .LBB81_6 +; SSE2-NEXT: .LBB81_4: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ss %rax, %xmm3 +; SSE2-NEXT: addss %xmm3, %xmm3 +; SSE2-NEXT: .LBB81_6: +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB81_7 +; SSE2-NEXT: # %bb.8: +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: jmp .LBB81_9 +; SSE2-NEXT: .LBB81_7: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: addss %xmm0, %xmm0 +; SSE2-NEXT: .LBB81_9: +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB81_10 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: cvtsi2ss %rax, %xmm2 +; SSE2-NEXT: jmp .LBB81_12 +; SSE2-NEXT: .LBB81_10: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: cvtsi2ss %rax, %xmm2 +; SSE2-NEXT: addss %xmm2, %xmm2 +; SSE2-NEXT: .LBB81_12: +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i64_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB81_1 +; SSE41-NEXT: # %bb.2: +; SSE41-NEXT: cvtsi2ss %rax, %xmm2 +; SSE41-NEXT: jmp .LBB81_3 +; SSE41-NEXT: .LBB81_1: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: cvtsi2ss %rax, %xmm2 +; SSE41-NEXT: addss %xmm2, %xmm2 +; SSE41-NEXT: .LBB81_3: +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB81_4 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ss %rax, %xmm0 +; SSE41-NEXT: jmp .LBB81_6 +; SSE41-NEXT: .LBB81_4: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ss %rax, %xmm0 +; SSE41-NEXT: addss %xmm0, %xmm0 +; SSE41-NEXT: .LBB81_6: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB81_7 +; SSE41-NEXT: # %bb.8: +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ss %rax, %xmm2 +; SSE41-NEXT: jmp .LBB81_9 +; SSE41-NEXT: .LBB81_7: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ss %rax, %xmm2 +; SSE41-NEXT: addss %xmm2, %xmm2 +; SSE41-NEXT: .LBB81_9: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB81_10 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ss %rax, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: retq +; SSE41-NEXT: .LBB81_10: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ss %rax, %xmm1 +; SSE41-NEXT: addss %xmm1, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_load_4i64_to_4f32: +; VEX: # %bb.0: +; VEX-NEXT: vmovdqa (%rdi), %xmm2 +; VEX-NEXT: vmovdqa 16(%rdi), %xmm0 +; VEX-NEXT: vpextrq $1, %xmm2, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB81_1 +; VEX-NEXT: # %bb.2: +; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; VEX-NEXT: jmp .LBB81_3 +; VEX-NEXT: .LBB81_1: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; VEX-NEXT: .LBB81_3: +; VEX-NEXT: vmovq %xmm2, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB81_4 +; VEX-NEXT: # %bb.5: +; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; VEX-NEXT: jmp .LBB81_6 +; VEX-NEXT: .LBB81_4: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; VEX-NEXT: .LBB81_6: +; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB81_7 +; VEX-NEXT: # %bb.8: +; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; VEX-NEXT: jmp .LBB81_9 +; VEX-NEXT: .LBB81_7: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; VEX-NEXT: .LBB81_9: +; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB81_10 +; VEX-NEXT: # %bb.11: +; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; VEX-NEXT: retq +; VEX-NEXT: .LBB81_10: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: uitofp_load_4i64_to_4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm1 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_load_4i64_to_4f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX512VL-NEXT: vmovq %xmm1, %rax +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm1 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_load_4i64_to_4f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_load_4i64_to_4f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtuqq2psy (%rdi), %xmm0 +; AVX512VLDQ-NEXT: retq + %ld = load <4 x i64>, <4 x i64> *%a + %cvt = uitofp <4 x i64> %ld to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @uitofp_load_4i32_to_4f32(<4 x i32> *%a) { +; SSE2-LABEL: uitofp_load_4i32_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: por {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: por {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addps {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i32_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; SSE41-NEXT: addps {{.*}}(%rip), %xmm0 +; SSE41-NEXT: addps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_load_4i32_to_4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_load_4i32_to_4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200] +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] +; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: uitofp_load_4i32_to_4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps (%rdi), %xmm0 +; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_load_4i32_to_4f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtudq2ps (%rdi), %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_load_4i32_to_4f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_load_4i32_to_4f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtudq2ps (%rdi), %xmm0 +; AVX512VLDQ-NEXT: retq + %ld = load <4 x i32>, <4 x i32> *%a + %cvt = uitofp <4 x i32> %ld to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @uitofp_load_4i16_to_4f32(<4 x i16> *%a) { +; SSE2-LABEL: uitofp_load_4i16_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i16_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: uitofp_load_4i16_to_4f32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <4 x i16>, <4 x i16> *%a + %cvt = uitofp <4 x i16> %ld to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @uitofp_load_4i8_to_4f32(<4 x i8> *%a) { +; SSE2-LABEL: uitofp_load_4i8_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i8_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: uitofp_load_4i8_to_4f32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <4 x i8>, <4 x i8> *%a + %cvt = uitofp <4 x i8> %ld to <4 x float> + ret <4 x float> %cvt +} + +define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { +; SSE2-LABEL: uitofp_load_8i64_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm5 +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB85_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: cvtsi2ss %rax, %xmm3 +; SSE2-NEXT: jmp .LBB85_3 +; SSE2-NEXT: .LBB85_1: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ss %rax, %xmm3 +; SSE2-NEXT: addss %xmm3, %xmm3 +; SSE2-NEXT: .LBB85_3: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB85_4 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: cvtsi2ss %rax, %xmm4 +; SSE2-NEXT: jmp .LBB85_6 +; SSE2-NEXT: .LBB85_4: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ss %rax, %xmm4 +; SSE2-NEXT: addss %xmm4, %xmm4 +; SSE2-NEXT: .LBB85_6: +; SSE2-NEXT: movq %xmm5, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB85_7 +; SSE2-NEXT: # %bb.8: +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: jmp .LBB85_9 +; SSE2-NEXT: .LBB85_7: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: addss %xmm0, %xmm0 +; SSE2-NEXT: .LBB85_9: +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] +; SSE2-NEXT: movq %xmm5, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB85_10 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: cvtsi2ss %rax, %xmm6 +; SSE2-NEXT: jmp .LBB85_12 +; SSE2-NEXT: .LBB85_10: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ss %rax, %xmm6 +; SSE2-NEXT: addss %xmm6, %xmm6 +; SSE2-NEXT: .LBB85_12: +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB85_13 +; SSE2-NEXT: # %bb.14: +; SSE2-NEXT: xorps %xmm5, %xmm5 +; SSE2-NEXT: cvtsi2ss %rax, %xmm5 +; SSE2-NEXT: jmp .LBB85_15 +; SSE2-NEXT: .LBB85_13: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm5, %xmm5 +; SSE2-NEXT: cvtsi2ss %rax, %xmm5 +; SSE2-NEXT: addss %xmm5, %xmm5 +; SSE2-NEXT: .LBB85_15: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB85_16 +; SSE2-NEXT: # %bb.17: +; SSE2-NEXT: cvtsi2ss %rax, %xmm7 +; SSE2-NEXT: jmp .LBB85_18 +; SSE2-NEXT: .LBB85_16: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ss %rax, %xmm7 +; SSE2-NEXT: addss %xmm7, %xmm7 +; SSE2-NEXT: .LBB85_18: +; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB85_19 +; SSE2-NEXT: # %bb.20: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: jmp .LBB85_21 +; SSE2-NEXT: .LBB85_19: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm1 +; SSE2-NEXT: .LBB85_21: +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE2-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB85_22 +; SSE2-NEXT: # %bb.23: +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: cvtsi2ss %rax, %xmm2 +; SSE2-NEXT: jmp .LBB85_24 +; SSE2-NEXT: .LBB85_22: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: cvtsi2ss %rax, %xmm2 +; SSE2-NEXT: addss %xmm2, %xmm2 +; SSE2-NEXT: .LBB85_24: +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_8i64_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa 16(%rdi), %xmm4 +; SSE41-NEXT: movdqa 32(%rdi), %xmm1 +; SSE41-NEXT: movdqa 48(%rdi), %xmm2 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB85_1 +; SSE41-NEXT: # %bb.2: +; SSE41-NEXT: cvtsi2ss %rax, %xmm3 +; SSE41-NEXT: jmp .LBB85_3 +; SSE41-NEXT: .LBB85_1: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: cvtsi2ss %rax, %xmm3 +; SSE41-NEXT: addss %xmm3, %xmm3 +; SSE41-NEXT: .LBB85_3: +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB85_4 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ss %rax, %xmm0 +; SSE41-NEXT: jmp .LBB85_6 +; SSE41-NEXT: .LBB85_4: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ss %rax, %xmm0 +; SSE41-NEXT: addss %xmm0, %xmm0 +; SSE41-NEXT: .LBB85_6: +; SSE41-NEXT: movq %xmm4, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB85_7 +; SSE41-NEXT: # %bb.8: +; SSE41-NEXT: cvtsi2ss %rax, %xmm5 +; SSE41-NEXT: jmp .LBB85_9 +; SSE41-NEXT: .LBB85_7: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: cvtsi2ss %rax, %xmm5 +; SSE41-NEXT: addss %xmm5, %xmm5 +; SSE41-NEXT: .LBB85_9: +; SSE41-NEXT: pextrq $1, %xmm4, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB85_10 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: xorps %xmm4, %xmm4 +; SSE41-NEXT: cvtsi2ss %rax, %xmm4 +; SSE41-NEXT: jmp .LBB85_12 +; SSE41-NEXT: .LBB85_10: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm4, %xmm4 +; SSE41-NEXT: cvtsi2ss %rax, %xmm4 +; SSE41-NEXT: addss %xmm4, %xmm4 +; SSE41-NEXT: .LBB85_12: +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB85_13 +; SSE41-NEXT: # %bb.14: +; SSE41-NEXT: cvtsi2ss %rax, %xmm6 +; SSE41-NEXT: jmp .LBB85_15 +; SSE41-NEXT: .LBB85_13: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: cvtsi2ss %rax, %xmm6 +; SSE41-NEXT: addss %xmm6, %xmm6 +; SSE41-NEXT: .LBB85_15: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB85_16 +; SSE41-NEXT: # %bb.17: +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ss %rax, %xmm1 +; SSE41-NEXT: jmp .LBB85_18 +; SSE41-NEXT: .LBB85_16: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ss %rax, %xmm1 +; SSE41-NEXT: addss %xmm1, %xmm1 +; SSE41-NEXT: .LBB85_18: +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[2,3] +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm5[0],xmm0[3] +; SSE41-NEXT: movq %xmm2, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB85_19 +; SSE41-NEXT: # %bb.20: +; SSE41-NEXT: xorps %xmm3, %xmm3 +; SSE41-NEXT: cvtsi2ss %rax, %xmm3 +; SSE41-NEXT: jmp .LBB85_21 +; SSE41-NEXT: .LBB85_19: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm3, %xmm3 +; SSE41-NEXT: cvtsi2ss %rax, %xmm3 +; SSE41-NEXT: addss %xmm3, %xmm3 +; SSE41-NEXT: .LBB85_21: +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] +; SSE41-NEXT: pextrq $1, %xmm2, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB85_22 +; SSE41-NEXT: # %bb.23: +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ss %rax, %xmm2 +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; SSE41-NEXT: retq +; SSE41-NEXT: .LBB85_22: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ss %rax, %xmm2 +; SSE41-NEXT: addss %xmm2, %xmm2 +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_load_8i64_to_8f32: +; VEX: # %bb.0: +; VEX-NEXT: vmovdqa (%rdi), %xmm1 +; VEX-NEXT: vmovdqa 16(%rdi), %xmm0 +; VEX-NEXT: vmovdqa 32(%rdi), %xmm4 +; VEX-NEXT: vmovdqa 48(%rdi), %xmm3 +; VEX-NEXT: vpextrq $1, %xmm4, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB85_1 +; VEX-NEXT: # %bb.2: +; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; VEX-NEXT: jmp .LBB85_3 +; VEX-NEXT: .LBB85_1: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; VEX-NEXT: .LBB85_3: +; VEX-NEXT: vmovq %xmm4, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB85_4 +; VEX-NEXT: # %bb.5: +; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm5 +; VEX-NEXT: jmp .LBB85_6 +; VEX-NEXT: .LBB85_4: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; VEX-NEXT: vaddss %xmm4, %xmm4, %xmm5 +; VEX-NEXT: .LBB85_6: +; VEX-NEXT: vmovq %xmm3, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB85_7 +; VEX-NEXT: # %bb.8: +; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm4 +; VEX-NEXT: jmp .LBB85_9 +; VEX-NEXT: .LBB85_7: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm4 +; VEX-NEXT: vaddss %xmm4, %xmm4, %xmm4 +; VEX-NEXT: .LBB85_9: +; VEX-NEXT: vpextrq $1, %xmm3, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB85_10 +; VEX-NEXT: # %bb.11: +; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm3 +; VEX-NEXT: jmp .LBB85_12 +; VEX-NEXT: .LBB85_10: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm3 +; VEX-NEXT: vaddss %xmm3, %xmm3, %xmm3 +; VEX-NEXT: .LBB85_12: +; VEX-NEXT: vpextrq $1, %xmm1, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB85_13 +; VEX-NEXT: # %bb.14: +; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm6 +; VEX-NEXT: jmp .LBB85_15 +; VEX-NEXT: .LBB85_13: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm6 +; VEX-NEXT: vaddss %xmm6, %xmm6, %xmm6 +; VEX-NEXT: .LBB85_15: +; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[2,3] +; VEX-NEXT: vmovq %xmm1, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB85_16 +; VEX-NEXT: # %bb.17: +; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm1 +; VEX-NEXT: jmp .LBB85_18 +; VEX-NEXT: .LBB85_16: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm1 +; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; VEX-NEXT: .LBB85_18: +; VEX-NEXT: vinsertps {{.*#+}} xmm5 = xmm1[0],xmm6[0],xmm1[2,3] +; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm4[0],xmm2[3] +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB85_19 +; VEX-NEXT: # %bb.20: +; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm2 +; VEX-NEXT: jmp .LBB85_21 +; VEX-NEXT: .LBB85_19: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm2 +; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; VEX-NEXT: .LBB85_21: +; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm5[0,1],xmm2[0],xmm5[3] +; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB85_22 +; VEX-NEXT: # %bb.23: +; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm0 +; VEX-NEXT: jmp .LBB85_24 +; VEX-NEXT: .LBB85_22: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm0 +; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; VEX-NEXT: .LBB85_24: +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: uitofp_load_8i64_to_8f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512F-NEXT: vpextrq $1, %xmm2, %rax +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm4, %xmm4 +; AVX512F-NEXT: vmovq %xmm2, %rax +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; AVX512F-NEXT: vmovq %xmm3, %rax +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; AVX512F-NEXT: vpextrq $1, %xmm3, %rax +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm1 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_load_8i64_to_8f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm4, %xmm4 +; AVX512VL-NEXT: vmovq %xmm2, %rax +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; AVX512VL-NEXT: vmovq %xmm3, %rax +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; AVX512VL-NEXT: vpextrq $1, %xmm3, %rax +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm0 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; AVX512VL-NEXT: vmovq %xmm1, %rax +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm1 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_load_8i64_to_8f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvtuqq2ps (%rdi), %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_load_8i64_to_8f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtuqq2ps (%rdi), %ymm0 +; AVX512VLDQ-NEXT: retq + %ld = load <8 x i64>, <8 x i64> *%a + %cvt = uitofp <8 x i64> %ld to <8 x float> + ret <8 x float> %cvt +} + +define <8 x float> @uitofp_load_8i32_to_8f32(<8 x i32> *%a) { +; SSE2-LABEL: uitofp_load_8i32_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200] +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928] +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movaps {{.*#+}} xmm6 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] +; SSE2-NEXT: addps %xmm6, %xmm0 +; SSE2-NEXT: addps %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: addps %xmm6, %xmm1 +; SSE2-NEXT: addps %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_8i32_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1392508928,1392508928,1392508928,1392508928] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] +; SSE41-NEXT: movaps {{.*#+}} xmm5 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] +; SSE41-NEXT: addps %xmm5, %xmm0 +; SSE41-NEXT: addps %xmm3, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] +; SSE41-NEXT: addps %xmm5, %xmm1 +; SSE41-NEXT: addps %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_load_8i32_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_load_8i32_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] +; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: uitofp_load_8i32_to_8f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps (%rdi), %ymm0 +; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_load_8i32_to_8f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtudq2ps (%rdi), %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_load_8i32_to_8f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_load_8i32_to_8f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtudq2ps (%rdi), %ymm0 +; AVX512VLDQ-NEXT: retq + %ld = load <8 x i32>, <8 x i32> *%a + %cvt = uitofp <8 x i32> %ld to <8 x float> + ret <8 x float> %cvt +} + +define <8 x float> @uitofp_load_8i16_to_8f32(<8 x i16> *%a) { +; SSE2-LABEL: uitofp_load_8i16_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_8i16_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_load_8i16_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_load_8i16_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: uitofp_load_8i16_to_8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512-NEXT: retq + %ld = load <8 x i16>, <8 x i16> *%a + %cvt = uitofp <8 x i16> %ld to <8 x float> + ret <8 x float> %cvt +} + +define <8 x float> @uitofp_load_8i8_to_8f32(<8 x i8> *%a) { +; SSE2-LABEL: uitofp_load_8i8_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_8i8_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_load_8i8_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_load_8i8_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: uitofp_load_8i8_to_8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512-NEXT: retq + %ld = load <8 x i8>, <8 x i8> *%a + %cvt = uitofp <8 x i8> %ld to <8 x float> + ret <8 x float> %cvt +} + +; +; Aggregates +; + +%Arguments = type <{ <8 x i8>, <8 x i16>, <8 x float>* }> +define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) { +; SSE2-LABEL: aggregate_sitofp_8i16_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq 24(%rdi), %rax +; SSE2-NEXT: movdqu 8(%rdi), %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, 16(%rax) +; SSE2-NEXT: movaps %xmm1, (%rax) +; SSE2-NEXT: retq +; +; SSE41-LABEL: aggregate_sitofp_8i16_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movq 24(%rdi), %rax +; SSE41-NEXT: pmovsxwd 16(%rdi), %xmm0 +; SSE41-NEXT: pmovsxwd 8(%rdi), %xmm1 +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, 16(%rax) +; SSE41-NEXT: movaps %xmm1, (%rax) +; SSE41-NEXT: retq +; +; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: movq 24(%rdi), %rax +; AVX1-NEXT: vpmovsxwd 16(%rdi), %xmm0 +; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: aggregate_sitofp_8i16_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: movq 24(%rdi), %rax +; AVX2-NEXT: vpmovsxwd 8(%rdi), %ymm0 +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: vmovaps %ymm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: aggregate_sitofp_8i16_to_8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: movq 24(%rdi), %rax +; AVX512-NEXT: vpmovsxwd 8(%rdi), %ymm0 +; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512-NEXT: vmovaps %ymm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = load %Arguments, %Arguments* %a0, align 1 + %2 = extractvalue %Arguments %1, 1 + %3 = extractvalue %Arguments %1, 2 + %4 = sitofp <8 x i16> %2 to <8 x float> + store <8 x float> %4, <8 x float>* %3, align 32 + ret void +} + +define <2 x double> @sitofp_i32_to_2f64(<2 x double> %a0, i32 %a1) nounwind { +; SSE-LABEL: sitofp_i32_to_2f64: +; SSE: # %bb.0: +; SSE-NEXT: cvtsi2sd %edi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_i32_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: vcvtsi2sd %edi, %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = sitofp i32 %a1 to double + %res = insertelement <2 x double> %a0, double %cvt, i32 0 + ret <2 x double> %res +} + +define <4 x float> @sitofp_i32_to_4f32(<4 x float> %a0, i32 %a1) nounwind { +; SSE-LABEL: sitofp_i32_to_4f32: +; SSE: # %bb.0: +; SSE-NEXT: cvtsi2ss %edi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_i32_to_4f32: +; AVX: # %bb.0: +; AVX-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = sitofp i32 %a1 to float + %res = insertelement <4 x float> %a0, float %cvt, i32 0 + ret <4 x float> %res +} + +define <2 x double> @sitofp_i64_to_2f64(<2 x double> %a0, i64 %a1) nounwind { +; SSE-LABEL: sitofp_i64_to_2f64: +; SSE: # %bb.0: +; SSE-NEXT: cvtsi2sd %rdi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_i64_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: vcvtsi2sd %rdi, %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = sitofp i64 %a1 to double + %res = insertelement <2 x double> %a0, double %cvt, i32 0 + ret <2 x double> %res +} + +define <4 x float> @sitofp_i64_to_4f32(<4 x float> %a0, i64 %a1) nounwind { +; SSE-LABEL: sitofp_i64_to_4f32: +; SSE: # %bb.0: +; SSE-NEXT: cvtsi2ss %rdi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_i64_to_4f32: +; AVX: # %bb.0: +; AVX-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = sitofp i64 %a1 to float + %res = insertelement <4 x float> %a0, float %cvt, i32 0 + ret <4 x float> %res +} + +; Extract from int vector and convert to FP. + +define float @extract0_sitofp_v4i32_f32(<4 x i32> %x) nounwind { +; SSE-LABEL: extract0_sitofp_v4i32_f32: +; SSE: # %bb.0: +; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: extract0_sitofp_v4i32_f32: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq + %e = extractelement <4 x i32> %x, i32 0 + %r = sitofp i32 %e to float + ret float %r +} + +define float @extract0_sitofp_v4i32_f32i_multiuse1(<4 x i32> %x) nounwind { +; SSE-LABEL: extract0_sitofp_v4i32_f32i_multiuse1: +; SSE: # %bb.0: +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE-NEXT: incl %eax +; SSE-NEXT: cvtsi2ss %eax, %xmm1 +; SSE-NEXT: divss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: extract0_sitofp_v4i32_f32i_multiuse1: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: incl %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm1, %xmm1 +; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %e = extractelement <4 x i32> %x, i32 0 + %f = sitofp i32 %e to float + %e1 = add i32 %e, 1 + %f1 = sitofp i32 %e1 to float + %r = fdiv float %f, %f1 + ret float %r +} + +define float @extract0_sitofp_v4i32_f32_multiuse2(<4 x i32> %x, i32* %p) nounwind { +; SSE-LABEL: extract0_sitofp_v4i32_f32_multiuse2: +; SSE: # %bb.0: +; SSE-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE-NEXT: movss %xmm0, (%rdi) +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: extract0_sitofp_v4i32_f32_multiuse2: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm1 +; AVX-NEXT: vmovss %xmm0, (%rdi) +; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: retq + %e = extractelement <4 x i32> %x, i32 0 + %r = sitofp i32 %e to float + store i32 %e, i32* %p + ret float %r +} + +define double @extract0_sitofp_v4i32_f64(<4 x i32> %x) nounwind { +; SSE-LABEL: extract0_sitofp_v4i32_f64: +; SSE: # %bb.0: +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cvtsi2sd %eax, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: extract0_sitofp_v4i32_f64: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %e = extractelement <4 x i32> %x, i32 0 + %r = sitofp i32 %e to double + ret double %r +} + +define float @extract0_uitofp_v4i32_f32(<4 x i32> %x) nounwind { +; SSE-LABEL: extract0_uitofp_v4i32_f32: +; SSE: # %bb.0: +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cvtsi2ss %rax, %xmm0 +; SSE-NEXT: retq +; +; VEX-LABEL: extract0_uitofp_v4i32_f32: +; VEX: # %bb.0: +; VEX-NEXT: vmovd %xmm0, %eax +; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: extract0_uitofp_v4i32_f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: extract0_uitofp_v4i32_f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: extract0_uitofp_v4i32_f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: extract0_uitofp_v4i32_f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtudq2ps %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %e = extractelement <4 x i32> %x, i32 0 + %r = uitofp i32 %e to float + ret float %r +} + +define double @extract0_uitofp_v4i32_f64(<4 x i32> %x) nounwind { +; SSE-LABEL: extract0_uitofp_v4i32_f64: +; SSE: # %bb.0: +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cvtsi2sd %rax, %xmm0 +; SSE-NEXT: retq +; +; VEX-LABEL: extract0_uitofp_v4i32_f64: +; VEX: # %bb.0: +; VEX-NEXT: vmovd %xmm0, %eax +; VEX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: extract0_uitofp_v4i32_f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: extract0_uitofp_v4i32_f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: extract0_uitofp_v4i32_f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: extract0_uitofp_v4i32_f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %e = extractelement <4 x i32> %x, i32 0 + %r = uitofp i32 %e to double + ret double %r +} + +; Extract non-zero element from int vector and convert to FP. + +define float @extract3_sitofp_v4i32_f32(<4 x i32> %x) nounwind { +; SSE-LABEL: extract3_sitofp_v4i32_f32: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: extract3_sitofp_v4i32_f32: +; AVX: # %bb.0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq + %e = extractelement <4 x i32> %x, i32 3 + %r = sitofp i32 %e to float + ret float %r +} + +define double @extract3_sitofp_v4i32_f64(<4 x i32> %x) nounwind { +; SSE2-LABEL: extract3_sitofp_v4i32_f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2sd %eax, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: extract3_sitofp_v4i32_f64: +; SSE41: # %bb.0: +; SSE41-NEXT: extractps $3, %xmm0, %eax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2sd %eax, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: extract3_sitofp_v4i32_f64: +; AVX: # %bb.0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %e = extractelement <4 x i32> %x, i32 3 + %r = sitofp i32 %e to double + ret double %r +} + +define float @extract3_uitofp_v4i32_f32(<4 x i32> %x) nounwind { +; SSE2-LABEL: extract3_uitofp_v4i32_f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: extract3_uitofp_v4i32_f32: +; SSE41: # %bb.0: +; SSE41-NEXT: extractps $3, %xmm0, %eax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ss %rax, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: extract3_uitofp_v4i32_f32: +; VEX: # %bb.0: +; VEX-NEXT: vextractps $3, %xmm0, %eax +; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: extract3_uitofp_v4i32_f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: extract3_uitofp_v4i32_f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: extract3_uitofp_v4i32_f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512VLDQ-NEXT: vcvtudq2ps %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %e = extractelement <4 x i32> %x, i32 3 + %r = uitofp i32 %e to float + ret float %r +} + +define double @extract3_uitofp_v4i32_f64(<4 x i32> %x) nounwind { +; SSE2-LABEL: extract3_uitofp_v4i32_f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2sd %rax, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: extract3_uitofp_v4i32_f64: +; SSE41: # %bb.0: +; SSE41-NEXT: extractps $3, %xmm0, %eax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2sd %rax, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: extract3_uitofp_v4i32_f64: +; VEX: # %bb.0: +; VEX-NEXT: vextractps $3, %xmm0, %eax +; VEX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: extract3_uitofp_v4i32_f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: extract3_uitofp_v4i32_f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: extract3_uitofp_v4i32_f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %e = extractelement <4 x i32> %x, i32 3 + %r = uitofp i32 %e to double + ret double %r +} + diff --git a/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll b/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll index 6f6776ea89e..a76c4d10afa 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=X64_WIDEN +; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=X86_WIDEN define void @test_udiv7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind { ; X64-LABEL: test_udiv7_v2i32: @@ -41,6 +43,45 @@ define void @test_udiv7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind { ; X86-NEXT: psrld $2, %xmm0 ; X86-NEXT: movq %xmm0, (%eax) ; X86-NEXT: retl +; +; X64_WIDEN-LABEL: test_udiv7_v2i32: +; X64_WIDEN: # %bb.0: +; X64_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] +; X64_WIDEN-NEXT: movdqa %xmm0, %xmm2 +; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm2 +; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm3 +; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; X64_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64_WIDEN-NEXT: psubd %xmm2, %xmm0 +; X64_WIDEN-NEXT: psrld $1, %xmm0 +; X64_WIDEN-NEXT: paddd %xmm2, %xmm0 +; X64_WIDEN-NEXT: psrld $2, %xmm0 +; X64_WIDEN-NEXT: movq %xmm0, (%rsi) +; X64_WIDEN-NEXT: retq +; +; X86_WIDEN-LABEL: test_udiv7_v2i32: +; X86_WIDEN: # %bb.0: +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] +; X86_WIDEN-NEXT: movdqa %xmm0, %xmm2 +; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm2 +; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X86_WIDEN-NEXT: movdqa %xmm0, %xmm3 +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3] +; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm3 +; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X86_WIDEN-NEXT: psubd %xmm2, %xmm0 +; X86_WIDEN-NEXT: psrld $1, %xmm0 +; X86_WIDEN-NEXT: paddd %xmm2, %xmm0 +; X86_WIDEN-NEXT: psrld $2, %xmm0 +; X86_WIDEN-NEXT: movq %xmm0, (%eax) +; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = udiv <2 x i32> %a, <i32 7, i32 7> store <2 x i32> %b, <2 x i32>* %y @@ -96,6 +137,55 @@ define void @test_urem7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind { ; X86-NEXT: paddd %xmm0, %xmm1 ; X86-NEXT: movq %xmm1, (%eax) ; X86-NEXT: retl +; +; X64_WIDEN-LABEL: test_urem7_v2i32: +; X64_WIDEN: # %bb.0: +; X64_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] +; X64_WIDEN-NEXT: movdqa %xmm0, %xmm2 +; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm2 +; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm3 +; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; X64_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64_WIDEN-NEXT: movdqa %xmm0, %xmm1 +; X64_WIDEN-NEXT: psubd %xmm2, %xmm1 +; X64_WIDEN-NEXT: psrld $1, %xmm1 +; X64_WIDEN-NEXT: paddd %xmm2, %xmm1 +; X64_WIDEN-NEXT: psrld $2, %xmm1 +; X64_WIDEN-NEXT: movdqa %xmm1, %xmm2 +; X64_WIDEN-NEXT: pslld $3, %xmm2 +; X64_WIDEN-NEXT: psubd %xmm2, %xmm1 +; X64_WIDEN-NEXT: paddd %xmm0, %xmm1 +; X64_WIDEN-NEXT: movq %xmm1, (%rsi) +; X64_WIDEN-NEXT: retq +; +; X86_WIDEN-LABEL: test_urem7_v2i32: +; X86_WIDEN: # %bb.0: +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] +; X86_WIDEN-NEXT: movdqa %xmm0, %xmm2 +; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm2 +; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X86_WIDEN-NEXT: movdqa %xmm0, %xmm3 +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3] +; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm3 +; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X86_WIDEN-NEXT: movdqa %xmm0, %xmm1 +; X86_WIDEN-NEXT: psubd %xmm2, %xmm1 +; X86_WIDEN-NEXT: psrld $1, %xmm1 +; X86_WIDEN-NEXT: paddd %xmm2, %xmm1 +; X86_WIDEN-NEXT: psrld $2, %xmm1 +; X86_WIDEN-NEXT: movdqa %xmm1, %xmm2 +; X86_WIDEN-NEXT: pslld $3, %xmm2 +; X86_WIDEN-NEXT: psubd %xmm2, %xmm1 +; X86_WIDEN-NEXT: paddd %xmm0, %xmm1 +; X86_WIDEN-NEXT: movq %xmm1, (%eax) +; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = urem <2 x i32> %a, <i32 7, i32 7> store <2 x i32> %b, <2 x i32>* %y @@ -153,6 +243,57 @@ define void @test_sdiv7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind { ; X86-NEXT: paddd %xmm0, %xmm2 ; X86-NEXT: movq %xmm2, (%eax) ; X86-NEXT: retl +; +; X64_WIDEN-LABEL: test_sdiv7_v2i32: +; X64_WIDEN: # %bb.0: +; X64_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] +; X64_WIDEN-NEXT: movdqa %xmm0, %xmm2 +; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm2 +; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm3 +; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; X64_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64_WIDEN-NEXT: pxor %xmm3, %xmm3 +; X64_WIDEN-NEXT: pcmpgtd %xmm0, %xmm3 +; X64_WIDEN-NEXT: pand %xmm1, %xmm3 +; X64_WIDEN-NEXT: paddd %xmm0, %xmm3 +; X64_WIDEN-NEXT: psubd %xmm3, %xmm2 +; X64_WIDEN-NEXT: paddd %xmm0, %xmm2 +; X64_WIDEN-NEXT: movdqa %xmm2, %xmm0 +; X64_WIDEN-NEXT: psrld $31, %xmm0 +; X64_WIDEN-NEXT: psrad $2, %xmm2 +; X64_WIDEN-NEXT: paddd %xmm0, %xmm2 +; X64_WIDEN-NEXT: movq %xmm2, (%rsi) +; X64_WIDEN-NEXT: retq +; +; X86_WIDEN-LABEL: test_sdiv7_v2i32: +; X86_WIDEN: # %bb.0: +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] +; X86_WIDEN-NEXT: movdqa %xmm0, %xmm2 +; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm2 +; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X86_WIDEN-NEXT: movdqa %xmm0, %xmm3 +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3] +; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm3 +; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86_WIDEN-NEXT: pxor %xmm3, %xmm3 +; X86_WIDEN-NEXT: pcmpgtd %xmm0, %xmm3 +; X86_WIDEN-NEXT: pand %xmm1, %xmm3 +; X86_WIDEN-NEXT: paddd %xmm0, %xmm3 +; X86_WIDEN-NEXT: psubd %xmm3, %xmm2 +; X86_WIDEN-NEXT: paddd %xmm0, %xmm2 +; X86_WIDEN-NEXT: movdqa %xmm2, %xmm0 +; X86_WIDEN-NEXT: psrld $31, %xmm0 +; X86_WIDEN-NEXT: psrad $2, %xmm2 +; X86_WIDEN-NEXT: paddd %xmm0, %xmm2 +; X86_WIDEN-NEXT: movq %xmm2, (%eax) +; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = sdiv <2 x i32> %a, <i32 7, i32 7> store <2 x i32> %b, <2 x i32>* %y @@ -218,6 +359,65 @@ define void @test_srem7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind { ; X86-NEXT: paddd %xmm0, %xmm2 ; X86-NEXT: movq %xmm2, (%eax) ; X86-NEXT: retl +; +; X64_WIDEN-LABEL: test_srem7_v2i32: +; X64_WIDEN: # %bb.0: +; X64_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] +; X64_WIDEN-NEXT: movdqa %xmm0, %xmm2 +; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm2 +; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm3 +; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; X64_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64_WIDEN-NEXT: pxor %xmm3, %xmm3 +; X64_WIDEN-NEXT: pcmpgtd %xmm0, %xmm3 +; X64_WIDEN-NEXT: pand %xmm1, %xmm3 +; X64_WIDEN-NEXT: paddd %xmm0, %xmm3 +; X64_WIDEN-NEXT: psubd %xmm3, %xmm2 +; X64_WIDEN-NEXT: paddd %xmm0, %xmm2 +; X64_WIDEN-NEXT: movdqa %xmm2, %xmm1 +; X64_WIDEN-NEXT: psrld $31, %xmm1 +; X64_WIDEN-NEXT: psrad $2, %xmm2 +; X64_WIDEN-NEXT: paddd %xmm1, %xmm2 +; X64_WIDEN-NEXT: movdqa %xmm2, %xmm1 +; X64_WIDEN-NEXT: pslld $3, %xmm1 +; X64_WIDEN-NEXT: psubd %xmm1, %xmm2 +; X64_WIDEN-NEXT: paddd %xmm0, %xmm2 +; X64_WIDEN-NEXT: movq %xmm2, (%rsi) +; X64_WIDEN-NEXT: retq +; +; X86_WIDEN-LABEL: test_srem7_v2i32: +; X86_WIDEN: # %bb.0: +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] +; X86_WIDEN-NEXT: movdqa %xmm0, %xmm2 +; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm2 +; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X86_WIDEN-NEXT: movdqa %xmm0, %xmm3 +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3] +; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm3 +; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86_WIDEN-NEXT: pxor %xmm3, %xmm3 +; X86_WIDEN-NEXT: pcmpgtd %xmm0, %xmm3 +; X86_WIDEN-NEXT: pand %xmm1, %xmm3 +; X86_WIDEN-NEXT: paddd %xmm0, %xmm3 +; X86_WIDEN-NEXT: psubd %xmm3, %xmm2 +; X86_WIDEN-NEXT: paddd %xmm0, %xmm2 +; X86_WIDEN-NEXT: movdqa %xmm2, %xmm1 +; X86_WIDEN-NEXT: psrld $31, %xmm1 +; X86_WIDEN-NEXT: psrad $2, %xmm2 +; X86_WIDEN-NEXT: paddd %xmm1, %xmm2 +; X86_WIDEN-NEXT: movdqa %xmm2, %xmm1 +; X86_WIDEN-NEXT: pslld $3, %xmm1 +; X86_WIDEN-NEXT: psubd %xmm1, %xmm2 +; X86_WIDEN-NEXT: paddd %xmm0, %xmm2 +; X86_WIDEN-NEXT: movq %xmm2, (%eax) +; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = srem <2 x i32> %a, <i32 7, i32 7> store <2 x i32> %b, <2 x i32>* %y @@ -240,6 +440,22 @@ define void @test_udiv_pow2_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind { ; X86-NEXT: psrld $3, %xmm0 ; X86-NEXT: movq %xmm0, (%eax) ; X86-NEXT: retl +; +; X64_WIDEN-LABEL: test_udiv_pow2_v2i32: +; X64_WIDEN: # %bb.0: +; X64_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64_WIDEN-NEXT: psrld $3, %xmm0 +; X64_WIDEN-NEXT: movq %xmm0, (%rsi) +; X64_WIDEN-NEXT: retq +; +; X86_WIDEN-LABEL: test_udiv_pow2_v2i32: +; X86_WIDEN: # %bb.0: +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86_WIDEN-NEXT: psrld $3, %xmm0 +; X86_WIDEN-NEXT: movq %xmm0, (%eax) +; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = udiv <2 x i32> %a, <i32 8, i32 8> store <2 x i32> %b, <2 x i32>* %y diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-widen.ll b/llvm/test/CodeGen/X86/vector-reduce-add-widen.ll new file mode 100644 index 00000000000..0841acfc41a --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-reduce-add-widen.ll @@ -0,0 +1,1386 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL + +; +; vXi64 +; + +define i64 @test_v2i64(<2 x i64> %a0) { +; SSE-LABEL: test_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: paddq %xmm0, %xmm1 +; SSE-NEXT: movq %xmm1, %rax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v4i64(<4 x i64> %a0) { +; SSE-LABEL: test_v4i64: +; SSE: # %bb.0: +; SSE-NEXT: paddq %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: paddq %xmm0, %xmm1 +; SSE-NEXT: movq %xmm1, %rax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v8i64(<8 x i64> %a0) { +; SSE-LABEL: test_v8i64: +; SSE: # %bb.0: +; SSE-NEXT: paddq %xmm3, %xmm1 +; SSE-NEXT: paddq %xmm2, %xmm1 +; SSE-NEXT: paddq %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: paddq %xmm1, %xmm0 +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v8i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v16i64(<16 x i64> %a0) { +; SSE-LABEL: test_v16i64: +; SSE: # %bb.0: +; SSE-NEXT: paddq %xmm6, %xmm2 +; SSE-NEXT: paddq %xmm7, %xmm3 +; SSE-NEXT: paddq %xmm5, %xmm3 +; SSE-NEXT: paddq %xmm1, %xmm3 +; SSE-NEXT: paddq %xmm4, %xmm2 +; SSE-NEXT: paddq %xmm3, %xmm2 +; SSE-NEXT: paddq %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: paddq %xmm2, %xmm0 +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v16i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %a0) + ret i64 %1 +} + +; +; vXi32 +; + +define i32 @test_v2i32(<2 x i32> %a0) { +; SSE-LABEL: test_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: retq +; +; AVX1-SLOW-LABEL: test_v2i32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v2i32: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v4i32(<4 x i32> %a0) { +; SSE-LABEL: test_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: retq +; +; AVX1-SLOW-LABEL: test_v4i32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v4i32: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v8i32(<8 x i32> %a0) { +; SSE-LABEL: test_v8i32: +; SSE: # %bb.0: +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: retq +; +; AVX1-SLOW-LABEL: test_v8i32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v8i32: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v16i32(<16 x i32> %a0) { +; SSE-LABEL: test_v16i32: +; SSE: # %bb.0: +; SSE-NEXT: paddd %xmm3, %xmm1 +; SSE-NEXT: paddd %xmm2, %xmm1 +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: retq +; +; AVX1-SLOW-LABEL: test_v16i32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v16i32: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v16i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v32i32(<32 x i32> %a0) { +; SSE-LABEL: test_v32i32: +; SSE: # %bb.0: +; SSE-NEXT: paddd %xmm6, %xmm2 +; SSE-NEXT: paddd %xmm7, %xmm3 +; SSE-NEXT: paddd %xmm5, %xmm3 +; SSE-NEXT: paddd %xmm1, %xmm3 +; SSE-NEXT: paddd %xmm4, %xmm2 +; SSE-NEXT: paddd %xmm3, %xmm2 +; SSE-NEXT: paddd %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: retq +; +; AVX1-SLOW-LABEL: test_v32i32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm4 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2 +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v32i32: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm4 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm2, %xmm2 +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v32i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> %a0) + ret i32 %1 +} + +; +; vXi16 +; + +define i16 @test_v2i16(<2 x i16> %a0) { +; SSE-LABEL: test_v2i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX1-SLOW-LABEL: test_v2i16: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v2i16: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v4i16(<4 x i16> %a0) { +; SSE-LABEL: test_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX1-SLOW-LABEL: test_v4i16: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v4i16: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v4i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v8i16(<8 x i16> %a0) { +; SSE-LABEL: test_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX1-SLOW-LABEL: test_v8i16: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v8i16: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v16i16(<16 x i16> %a0) { +; SSE-LABEL: test_v16i16: +; SSE: # %bb.0: +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX1-SLOW-LABEL: test_v16i16: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v16i16: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v32i16(<32 x i16> %a0) { +; SSE-LABEL: test_v32i16: +; SSE: # %bb.0: +; SSE-NEXT: paddw %xmm3, %xmm1 +; SSE-NEXT: paddw %xmm2, %xmm1 +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX1-SLOW-LABEL: test_v32i16: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm3, %xmm2 +; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v32i16: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm3, %xmm2 +; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v32i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v64i16(<64 x i16> %a0) { +; SSE-LABEL: test_v64i16: +; SSE: # %bb.0: +; SSE-NEXT: paddw %xmm6, %xmm2 +; SSE-NEXT: paddw %xmm7, %xmm3 +; SSE-NEXT: paddw %xmm5, %xmm3 +; SSE-NEXT: paddw %xmm1, %xmm3 +; SSE-NEXT: paddw %xmm4, %xmm2 +; SSE-NEXT: paddw %xmm3, %xmm2 +; SSE-NEXT: paddw %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: paddw %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX1-SLOW-LABEL: test_v64i16: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpaddw %xmm3, %xmm1, %xmm4 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm3, %xmm1 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm3, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm4, %xmm2, %xmm2 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v64i16: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpaddw %xmm3, %xmm1, %xmm4 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-FAST-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm3, %xmm1 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm3, %xmm1 +; AVX1-FAST-NEXT: vpaddw %xmm4, %xmm2, %xmm2 +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v64i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpaddw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v64i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> %a0) + ret i16 %1 +} + +; +; vXi8 +; + +define i8 @test_v2i8(<2 x i8> %a0) { +; SSE2-LABEL: test_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: paddb %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v2i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v4i8(<4 x i8> %a0) { +; SSE2-LABEL: test_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: paddb %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: paddb %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v4i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v8i8(<8 x i8> %a0) { +; SSE2-LABEL: test_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: paddb %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: paddb %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: paddb %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v16i8(<16 x i8> %a0) { +; SSE2-LABEL: test_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: paddb %xmm0, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: psadbw %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v32i8(<32 x i8> %a0) { +; SSE2-LABEL: test_v32i8: +; SSE2: # %bb.0: +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v32i8: +; SSE41: # %bb.0: +; SSE41-NEXT: paddb %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: paddb %xmm0, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: psadbw %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v64i8(<64 x i8> %a0) { +; SSE2-LABEL: test_v64i8: +; SSE2: # %bb.0: +; SSE2-NEXT: paddb %xmm3, %xmm1 +; SSE2-NEXT: paddb %xmm2, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v64i8: +; SSE41: # %bb.0: +; SSE41-NEXT: paddb %xmm3, %xmm1 +; SSE41-NEXT: paddb %xmm2, %xmm1 +; SSE41-NEXT: paddb %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: paddb %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: psadbw %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v64i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v64i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v128i8(<128 x i8> %a0) { +; SSE2-LABEL: test_v128i8: +; SSE2: # %bb.0: +; SSE2-NEXT: paddb %xmm7, %xmm3 +; SSE2-NEXT: paddb %xmm5, %xmm3 +; SSE2-NEXT: paddb %xmm1, %xmm3 +; SSE2-NEXT: paddb %xmm6, %xmm2 +; SSE2-NEXT: paddb %xmm4, %xmm2 +; SSE2-NEXT: paddb %xmm3, %xmm2 +; SSE2-NEXT: paddb %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v128i8: +; SSE41: # %bb.0: +; SSE41-NEXT: paddb %xmm7, %xmm3 +; SSE41-NEXT: paddb %xmm5, %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm3 +; SSE41-NEXT: paddb %xmm6, %xmm2 +; SSE41-NEXT: paddb %xmm4, %xmm2 +; SSE41-NEXT: paddb %xmm3, %xmm2 +; SSE41-NEXT: paddb %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE41-NEXT: paddb %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: psadbw %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v128i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v128i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v128i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> %a0) + ret i8 %1 +} + +declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>) +declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>) +declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>) +declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>) + +declare i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32>) +declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) +declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) +declare i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32>) + +declare i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16>) +declare i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16>) +declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) +declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>) +declare i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16>) +declare i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16>) + +declare i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8>) +declare i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8>) +declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) +declare i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8>) +declare i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8>) +declare i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-widen.ll b/llvm/test/CodeGen/X86/vector-reduce-and-widen.ll new file mode 100644 index 00000000000..d169f0cb138 --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-reduce-and-widen.ll @@ -0,0 +1,1168 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 + +; +; vXi64 +; + +define i64 @test_v2i64(<2 x i64> %a0) { +; SSE-LABEL: test_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movq %xmm1, %rax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v4i64(<4 x i64> %a0) { +; SSE-LABEL: test_v4i64: +; SSE: # %bb.0: +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movq %xmm1, %rax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v8i64(<8 x i64> %a0) { +; SSE-LABEL: test_v8i64: +; SSE: # %bb.0: +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v8i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v16i64(<16 x i64> %a0) { +; SSE-LABEL: test_v16i64: +; SSE: # %bb.0: +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v16i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> %a0) + ret i64 %1 +} + +; +; vXi32 +; + +define i32 @test_v2i32(<2 x i32> %a0) { +; SSE-LABEL: test_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v4i32(<4 x i32> %a0) { +; SSE-LABEL: test_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v8i32(<8 x i32> %a0) { +; SSE-LABEL: test_v8i32: +; SSE: # %bb.0: +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v16i32(<16 x i32> %a0) { +; SSE-LABEL: test_v16i32: +; SSE: # %bb.0: +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v16i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v32i32(<32 x i32> %a0) { +; SSE-LABEL: test_v32i32: +; SSE: # %bb.0: +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v32i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> %a0) + ret i32 %1 +} + +; +; vXi16 +; + +define i16 @test_v2i16(<2 x i16> %a0) { +; SSE-LABEL: test_v2i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v4i16(<4 x i16> %a0) { +; SSE-LABEL: test_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v8i16(<8 x i16> %a0) { +; SSE-LABEL: test_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v16i16(<16 x i16> %a0) { +; SSE-LABEL: test_v16i16: +; SSE: # %bb.0: +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v32i16(<32 x i16> %a0) { +; SSE-LABEL: test_v32i16: +; SSE: # %bb.0: +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v32i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v64i16(<64 x i16> %a0) { +; SSE-LABEL: test_v64i16: +; SSE: # %bb.0: +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v64i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v64i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v64i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> %a0) + ret i16 %1 +} + +; +; vXi8 +; + +define i8 @test_v2i8(<2 x i8> %a0) { +; SSE2-LABEL: test_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v4i8(<4 x i8> %a0) { +; SSE2-LABEL: test_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v8i8(<8 x i8> %a0) { +; SSE2-LABEL: test_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v16i8(<16 x i8> %a0) { +; SSE2-LABEL: test_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v32i8(<32 x i8> %a0) { +; SSE2-LABEL: test_v32i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v32i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v64i8(<64 x i8> %a0) { +; SSE2-LABEL: test_v64i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v64i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v64i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v64i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v128i8(<128 x i8> %a0) { +; SSE2-LABEL: test_v128i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v128i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pand %xmm6, %xmm2 +; SSE41-NEXT: pand %xmm7, %xmm3 +; SSE41-NEXT: pand %xmm5, %xmm3 +; SSE41-NEXT: pand %xmm1, %xmm3 +; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: pand %xmm3, %xmm2 +; SSE41-NEXT: pand %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v128i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v128i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v128i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> %a0) + ret i8 %1 +} + +declare i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64>) +declare i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64>) +declare i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64>) +declare i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64>) + +declare i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32>) +declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>) +declare i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32>) +declare i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32>) +declare i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32>) + +declare i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16>) +declare i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16>) +declare i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16>) +declare i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16>) +declare i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16>) +declare i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16>) + +declare i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8>) +declare i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8>) +declare i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8>) +declare i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8>) +declare i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8>) +declare i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul-widen.ll b/llvm/test/CodeGen/X86/vector-reduce-mul-widen.ll new file mode 100644 index 00000000000..8242160f8ff --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-reduce-mul-widen.ll @@ -0,0 +1,3022 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL --check-prefix=AVX512BWVL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL --check-prefix=AVX512DQVL + +; +; vXi64 +; + +define i64 @test_v2i64(<2 x i64> %a0) { +; SSE-LABEL: test_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pmuludq %xmm0, %xmm3 +; SSE-NEXT: paddq %xmm2, %xmm3 +; SSE-NEXT: psllq $32, %xmm3 +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: paddq %xmm3, %xmm0 +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: retq +; +; AVX512BW-LABEL: test_v2i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: test_v2i64: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, %rax +; AVX512BWVL-NEXT: retq +; +; AVX512DQ-LABEL: test_v2i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vmovq %xmm0, %rax +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512DQVL-LABEL: test_v2i64: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vmovq %xmm0, %rax +; AVX512DQVL-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v4i64(<4 x i64> %a0) { +; SSE-LABEL: test_v4i64: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: psrlq $32, %xmm3 +; SSE-NEXT: pmuludq %xmm0, %xmm3 +; SSE-NEXT: paddq %xmm2, %xmm3 +; SSE-NEXT: psllq $32, %xmm3 +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: paddq %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pmuludq %xmm0, %xmm3 +; SSE-NEXT: paddq %xmm2, %xmm3 +; SSE-NEXT: psllq $32, %xmm3 +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: paddq %xmm3, %xmm0 +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v4i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: test_v4i64: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, %rax +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512DQ-LABEL: test_v4i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vmovq %xmm0, %rax +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512DQVL-LABEL: test_v4i64: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vmovq %xmm0, %rax +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v8i64(<8 x i64> %a0) { +; SSE-LABEL: test_v8i64: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: psrlq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm1, %xmm5 +; SSE-NEXT: paddq %xmm4, %xmm5 +; SSE-NEXT: psllq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm3, %xmm1 +; SSE-NEXT: paddq %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrlq $32, %xmm3 +; SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm0, %xmm4 +; SSE-NEXT: paddq %xmm3, %xmm4 +; SSE-NEXT: psllq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm2, %xmm0 +; SSE-NEXT: paddq %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: psrlq $32, %xmm3 +; SSE-NEXT: pmuludq %xmm0, %xmm3 +; SSE-NEXT: paddq %xmm2, %xmm3 +; SSE-NEXT: psllq $32, %xmm3 +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: paddq %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pmuludq %xmm0, %xmm3 +; SSE-NEXT: paddq %xmm2, %xmm3 +; SSE-NEXT: psllq $32, %xmm3 +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: paddq %xmm3, %xmm0 +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v8i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4 +; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5 +; AVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 +; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3 +; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 +; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm1 +; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm3 +; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2 +; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 +; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 +; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 +; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 +; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v8i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 +; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 +; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 +; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 +; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: test_v8i64: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 +; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 +; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3 +; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 +; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2 +; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2 +; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, %rax +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512DQ-LABEL: test_v8i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vmovq %xmm0, %rax +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512DQVL-LABEL: test_v8i64: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vmovq %xmm0, %rax +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v16i64(<16 x i64> %a0) { +; SSE-LABEL: test_v16i64: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: psrlq $32, %xmm8 +; SSE-NEXT: pmuludq %xmm6, %xmm8 +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: psrlq $32, %xmm9 +; SSE-NEXT: pmuludq %xmm2, %xmm9 +; SSE-NEXT: paddq %xmm8, %xmm9 +; SSE-NEXT: psllq $32, %xmm9 +; SSE-NEXT: pmuludq %xmm6, %xmm2 +; SSE-NEXT: paddq %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: psrlq $32, %xmm8 +; SSE-NEXT: pmuludq %xmm4, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: psrlq $32, %xmm6 +; SSE-NEXT: pmuludq %xmm0, %xmm6 +; SSE-NEXT: paddq %xmm8, %xmm6 +; SSE-NEXT: psllq $32, %xmm6 +; SSE-NEXT: pmuludq %xmm4, %xmm0 +; SSE-NEXT: paddq %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: psrlq $32, %xmm6 +; SSE-NEXT: pmuludq %xmm3, %xmm6 +; SSE-NEXT: paddq %xmm4, %xmm6 +; SSE-NEXT: psllq $32, %xmm6 +; SSE-NEXT: pmuludq %xmm7, %xmm3 +; SSE-NEXT: paddq %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: psrlq $32, %xmm6 +; SSE-NEXT: pmuludq %xmm1, %xmm6 +; SSE-NEXT: paddq %xmm4, %xmm6 +; SSE-NEXT: psllq $32, %xmm6 +; SSE-NEXT: pmuludq %xmm5, %xmm1 +; SSE-NEXT: paddq %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: psrlq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm1, %xmm5 +; SSE-NEXT: paddq %xmm4, %xmm5 +; SSE-NEXT: psllq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm3, %xmm1 +; SSE-NEXT: paddq %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrlq $32, %xmm3 +; SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm0, %xmm4 +; SSE-NEXT: paddq %xmm3, %xmm4 +; SSE-NEXT: psllq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm2, %xmm0 +; SSE-NEXT: paddq %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: psrlq $32, %xmm3 +; SSE-NEXT: pmuludq %xmm0, %xmm3 +; SSE-NEXT: paddq %xmm2, %xmm3 +; SSE-NEXT: psllq $32, %xmm3 +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: paddq %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pmuludq %xmm0, %xmm3 +; SSE-NEXT: paddq %xmm2, %xmm3 +; SSE-NEXT: psllq $32, %xmm3 +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: paddq %xmm3, %xmm0 +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v16i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 +; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm4 +; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 +; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm5 +; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 +; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm5 +; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm5 +; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6 +; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6 +; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5 +; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm6 +; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm6 +; AVX1-NEXT: vpmuludq %xmm3, %xmm6, %xmm6 +; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm7 +; AVX1-NEXT: vpmuludq %xmm7, %xmm1, %xmm7 +; AVX1-NEXT: vpaddq %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6 +; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3 +; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6 +; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6 +; AVX1-NEXT: vpaddq %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 +; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm1 +; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm2 +; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm2 +; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm2 +; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3 +; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm4 +; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm4 +; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm5 +; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm5 +; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4 +; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3 +; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 +; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4 +; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm4 +; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3 +; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2 +; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 +; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 +; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 +; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 +; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v16i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 +; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 +; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 +; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 +; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 +; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 +; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 +; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 +; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: test_v16i64: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 +; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 +; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3 +; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 +; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2 +; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2 +; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 +; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 +; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3 +; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 +; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2 +; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2 +; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, %rax +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512DQ-LABEL: test_v16i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vmovq %xmm0, %rax +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512DQVL-LABEL: test_v16i64: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vmovq %xmm0, %rax +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> %a0) + ret i64 %1 +} + +; +; vXi32 +; + +define i32 @test_v2i32(<2 x i32> %a0) { +; SSE2-LABEL: test_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: pmuludq %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmulld %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v4i32(<4 x i32> %a0) { +; SSE2-LABEL: test_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm3 +; SSE2-NEXT: pmuludq %xmm0, %xmm1 +; SSE2-NEXT: pmuludq %xmm3, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pmulld %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE41-NEXT: pmulld %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v8i32(<8 x i32> %a0) { +; SSE2-LABEL: test_v8i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm3 +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pmuludq %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,0,0] +; SSE2-NEXT: pmuludq %xmm3, %xmm0 +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmulld %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pmulld %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE41-NEXT: pmulld %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v16i32(<16 x i32> %a0) { +; SSE2-LABEL: test_v16i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm5, %xmm2 +; SSE2-NEXT: pmuludq %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pmuludq %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,0,0] +; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmulld %xmm3, %xmm1 +; SSE41-NEXT: pmulld %xmm2, %xmm1 +; SSE41-NEXT: pmulld %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pmulld %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmulld %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v16i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v32i32(<32 x i32> %a0) { +; SSE2-LABEL: test_v32i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm8, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm8, %xmm10 +; SSE2-NEXT: pmuludq %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm8, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm8, %xmm11 +; SSE2-NEXT: pmuludq %xmm9, %xmm11 +; SSE2-NEXT: pmuludq %xmm10, %xmm11 +; SSE2-NEXT: pmuludq %xmm6, %xmm2 +; SSE2-NEXT: pmuludq %xmm4, %xmm0 +; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: pmuludq %xmm7, %xmm3 +; SSE2-NEXT: pmuludq %xmm5, %xmm1 +; SSE2-NEXT: pmuludq %xmm3, %xmm1 +; SSE2-NEXT: pmuludq %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,0,0] +; SSE2-NEXT: pmuludq %xmm11, %xmm1 +; SSE2-NEXT: pmuludq %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v32i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmulld %xmm6, %xmm2 +; SSE41-NEXT: pmulld %xmm7, %xmm3 +; SSE41-NEXT: pmulld %xmm5, %xmm3 +; SSE41-NEXT: pmulld %xmm1, %xmm3 +; SSE41-NEXT: pmulld %xmm4, %xmm2 +; SSE41-NEXT: pmulld %xmm3, %xmm2 +; SSE41-NEXT: pmulld %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE41-NEXT: pmulld %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmulld %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v32i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> %a0) + ret i32 %1 +} + +; +; vXi16 +; + +define i16 @test_v2i16(<2 x i16> %a0) { +; SSE-LABEL: test_v2i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pmullw %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v4i16(<4 x i16> %a0) { +; SSE-LABEL: test_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: pmullw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pmullw %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v8i16(<8 x i16> %a0) { +; SSE-LABEL: test_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pmullw %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: pmullw %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pmullw %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v16i16(<16 x i16> %a0) { +; SSE-LABEL: test_v16i16: +; SSE: # %bb.0: +; SSE-NEXT: pmullw %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pmullw %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: pmullw %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pmullw %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v32i16(<32 x i16> %a0) { +; SSE-LABEL: test_v32i16: +; SSE: # %bb.0: +; SSE-NEXT: pmullw %xmm3, %xmm1 +; SSE-NEXT: pmullw %xmm2, %xmm1 +; SSE-NEXT: pmullw %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pmullw %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: pmullw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pmullw %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v32i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v32i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: test_v32i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, %eax +; AVX512BWVL-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512DQ-LABEL: test_v32i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovd %xmm0, %eax +; AVX512DQ-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512DQVL-LABEL: test_v32i16: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vmovd %xmm0, %eax +; AVX512DQVL-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v64i16(<64 x i16> %a0) { +; SSE-LABEL: test_v64i16: +; SSE: # %bb.0: +; SSE-NEXT: pmullw %xmm6, %xmm2 +; SSE-NEXT: pmullw %xmm7, %xmm3 +; SSE-NEXT: pmullw %xmm5, %xmm3 +; SSE-NEXT: pmullw %xmm1, %xmm3 +; SSE-NEXT: pmullw %xmm4, %xmm2 +; SSE-NEXT: pmullw %xmm3, %xmm2 +; SSE-NEXT: pmullw %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pmullw %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: pmullw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pmullw %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v64i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v64i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpmullw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v64i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: test_v64i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, %eax +; AVX512BWVL-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512DQ-LABEL: test_v64i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmullw %ymm3, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpmullw %ymm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovd %xmm0, %eax +; AVX512DQ-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512DQVL-LABEL: test_v64i16: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm1, %ymm1 +; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm2, %ymm1 +; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vmovd %xmm0, %eax +; AVX512DQVL-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> %a0) + ret i16 %1 +} + +; +; vXi8 +; + +define i8 @test_v2i8(<2 x i8> %a0) { +; SSE2-LABEL: test_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pmullw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pmullw %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v2i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v4i8(<4 x i8> %a0) { +; SSE2-LABEL: test_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pmullw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmullw %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,xmm1[6],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero +; SSE41-NEXT: pmullw %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX-NEXT: vpmullw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero +; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v4i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmullw %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v8i8(<8 x i8> %a0) { +; SSE2-LABEL: test_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,2,3,3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: pmullw %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,2,3,0] +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pmullw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmullw %xmm1, %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmullw %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,xmm1[6],zero,xmm1[10],zero,xmm1[14],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero +; SSE41-NEXT: pmullw %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-NEXT: vpmullw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero +; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512BW-LABEL: test_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpmullw %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BW-NEXT: # kill: def $al killed $al killed $eax +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v8i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpmullw %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3] +; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero +; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax +; AVX512VL-NEXT: # kill: def $al killed $al killed $eax +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: test_v8i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX512DQ-NEXT: vpmullw %xmm0, %xmm1, %xmm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero +; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax +; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax +; AVX512DQ-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v16i8(<16 x i8> %a0) { +; SSE2-LABEL: test_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,2,3,3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: pmullw %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm3, %xmm0 +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: pmullw %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: packuswb %xmm0, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmullw %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmullw %xmm3, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmullw %xmm3, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pmullw %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,xmm0[4],zero,xmm0[6],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7] +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 +; AVX512BW-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 +; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BW-NEXT: # kill: def $al killed $al killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: test_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512DQ-LABEL: test_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512DQ-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax +; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512DQVL-LABEL: test_v16i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQVL-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQVL-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512DQVL-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQVL-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax +; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v32i8(<32 x i8> %a0) { +; SSE2-LABEL: test_v32i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-NEXT: pmullw %xmm2, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: pmullw %xmm3, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,2,3,3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: pmullw %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm3, %xmm0 +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: pmullw %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: packuswb %xmm0, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v32i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmullw %xmm2, %xmm3 +; SSE41-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: pmullw %xmm1, %xmm3 +; SSE41-NEXT: pmullw %xmm0, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: packuswb %xmm0, %xmm3 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; SSE41-NEXT: pmullw %xmm1, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: packuswb %xmm0, %xmm3 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; SSE41-NEXT: pmullw %xmm1, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: packuswb %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pmullw %xmm3, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v32i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 +; AVX512BW-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 +; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BW-NEXT: # kill: def $al killed $al killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: test_v32i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1 +; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1 +; AVX512BWVL-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512DQ-LABEL: test_v32i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax +; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512DQVL-LABEL: test_v32i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512DQVL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX512DQVL-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm1 +; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax +; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v64i8(<64 x i8> %a0) { +; SSE2-LABEL: test_v64i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: pmullw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: pmullw %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; SSE2-NEXT: pmullw %xmm3, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE2-NEXT: pmullw %xmm1, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: pmullw %xmm4, %xmm0 +; SSE2-NEXT: pmullw %xmm5, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,2,3,3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: pmullw %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm3, %xmm0 +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: pmullw %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: packuswb %xmm0, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v64i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; SSE41-NEXT: pmullw %xmm3, %xmm1 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE41-NEXT: pmullw %xmm1, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmullw %xmm3, %xmm6 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; SSE41-NEXT: pshufb %xmm3, %xmm6 +; SSE41-NEXT: pmullw %xmm4, %xmm5 +; SSE41-NEXT: pshufb %xmm3, %xmm5 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero +; SSE41-NEXT: pmullw %xmm4, %xmm5 +; SSE41-NEXT: pshufb %xmm3, %xmm5 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; SSE41-NEXT: pmullw %xmm2, %xmm3 +; SSE41-NEXT: pmullw %xmm0, %xmm3 +; SSE41-NEXT: pand %xmm1, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: packuswb %xmm0, %xmm3 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; SSE41-NEXT: pmullw %xmm2, %xmm3 +; SSE41-NEXT: pand %xmm1, %xmm3 +; SSE41-NEXT: packuswb %xmm0, %xmm3 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; SSE41-NEXT: pmullw %xmm2, %xmm3 +; SSE41-NEXT: pand %xmm1, %xmm3 +; SSE41-NEXT: packuswb %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pmullw %xmm3, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v64i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; AVX1-NEXT: vpmullw %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-NEXT: vpmullw %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmullw %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmullw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmullw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v64i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v64i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] +; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512BW-NEXT: vpmullw %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512BW-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512BW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512BW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm1 +; AVX512BW-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BW-NEXT: # kill: def $al killed $al killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: test_v64i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] +; AVX512BWVL-NEXT: vpmullw %zmm3, %zmm2, %zmm2 +; AVX512BWVL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm2 +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512BWVL-NEXT: vpmullw %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX512BWVL-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512BWVL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512BWVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512BWVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpand %xmm1, %xmm0, %xmm1 +; AVX512BWVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512DQ-LABEL: test_v64i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQ-NEXT: vpmullw %ymm2, %ymm3, %ymm2 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax +; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512DQVL-LABEL: test_v64i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm3, %ymm2 +; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQVL-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512DQVL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX512DQVL-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm1 +; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax +; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v128i8(<128 x i8> %a0) { +; SSE2-LABEL: test_v128i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15] +; SSE2-NEXT: pmullw %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm4, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] +; SSE2-NEXT: pmullw %xmm9, %xmm10 +; SSE2-NEXT: movdqa %xmm0, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm3, %xmm11 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15] +; SSE2-NEXT: pmullw %xmm8, %xmm11 +; SSE2-NEXT: movdqa %xmm5, %xmm12 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm0[8],xmm12[9],xmm0[9],xmm12[10],xmm0[10],xmm12[11],xmm0[11],xmm12[12],xmm0[12],xmm12[13],xmm0[13],xmm12[14],xmm0[14],xmm12[15],xmm0[15] +; SSE2-NEXT: pmullw %xmm11, %xmm12 +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] +; SSE2-NEXT: pmullw %xmm12, %xmm8 +; SSE2-NEXT: pmullw %xmm10, %xmm8 +; SSE2-NEXT: pmullw %xmm9, %xmm8 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: pmullw %xmm6, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: pmullw %xmm2, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: pmullw %xmm7, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: pmullw %xmm3, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: pmullw %xmm5, %xmm1 +; SSE2-NEXT: pmullw %xmm4, %xmm1 +; SSE2-NEXT: pmullw %xmm8, %xmm1 +; SSE2-NEXT: pmullw %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,2,3,3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: pmullw %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: pmullw %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: pmullw %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: packuswb %xmm0, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v128i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE41-NEXT: pmullw %xmm6, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm10 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE41-NEXT: pmullw %xmm2, %xmm4 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm11 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE41-NEXT: pmullw %xmm7, %xmm3 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; SSE41-NEXT: pmullw %xmm3, %xmm5 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; SSE41-NEXT: pmullw %xmm5, %xmm1 +; SSE41-NEXT: pmullw %xmm4, %xmm1 +; SSE41-NEXT: pmullw %xmm7, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; SSE41-NEXT: pshufb %xmm5, %xmm3 +; SSE41-NEXT: pmullw %xmm11, %xmm6 +; SSE41-NEXT: pshufb %xmm5, %xmm6 +; SSE41-NEXT: pmullw %xmm10, %xmm2 +; SSE41-NEXT: pshufb %xmm5, %xmm2 +; SSE41-NEXT: pmullw %xmm8, %xmm9 +; SSE41-NEXT: pshufb %xmm5, %xmm9 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: pmullw %xmm7, %xmm2 +; SSE41-NEXT: pshufb %xmm5, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: pmullw %xmm6, %xmm3 +; SSE41-NEXT: pshufb %xmm5, %xmm3 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: pmullw %xmm3, %xmm2 +; SSE41-NEXT: pshufb %xmm5, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: pmullw %xmm1, %xmm2 +; SSE41-NEXT: pmullw %xmm0, %xmm2 +; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: packuswb %xmm0, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE41-NEXT: pmullw %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: packuswb %xmm0, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; SSE41-NEXT: pmullw %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: packuswb %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pmullw %xmm2, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v128i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm8 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm11 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; AVX1-NEXT: vpmullw %xmm7, %xmm5, %xmm10 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] +; AVX1-NEXT: vpmullw %xmm10, %xmm5, %xmm10 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; AVX1-NEXT: vpmullw %xmm10, %xmm6, %xmm6 +; AVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6 +; AVX1-NEXT: vpmullw %xmm6, %xmm9, %xmm6 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX1-NEXT: vpmullw %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero,xmm11[4],zero,xmm11[5],zero,xmm11[6],zero,xmm11[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm4 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm4 +; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmullw %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v128i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] +; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] +; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] +; AVX2-NEXT: vpmullw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v128i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm1 +; AVX512BW-NEXT: vpackuswb %zmm4, %zmm1, %zmm1 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] +; AVX512BW-NEXT: vpmullw %zmm4, %zmm2, %zmm2 +; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512BW-NEXT: vpmullw %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512BW-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512BW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512BW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm1 +; AVX512BW-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BW-NEXT: # kill: def $al killed $al killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: test_v128i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm3, %zmm2 +; AVX512BWVL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm4 +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm1 +; AVX512BWVL-NEXT: vpackuswb %zmm4, %zmm1, %zmm1 +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] +; AVX512BWVL-NEXT: vpmullw %zmm4, %zmm2, %zmm2 +; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm2 +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512BWVL-NEXT: vpmullw %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX512BWVL-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512BWVL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512BWVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512BWVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpand %xmm1, %xmm0, %xmm1 +; AVX512BWVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512DQ-LABEL: test_v128i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512DQ-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] +; AVX512DQ-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQ-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] +; AVX512DQ-NEXT: vpmullw %ymm3, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] +; AVX512DQ-NEXT: vpmullw %ymm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512DQ-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax +; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512DQVL-LABEL: test_v128i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] +; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] +; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm1, %ymm1 +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] +; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm2, %ymm1 +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512DQVL-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 +; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512DQVL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX512DQVL-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm1 +; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax +; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> %a0) + ret i8 %1 +} + +declare i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64>) +declare i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64>) +declare i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64>) +declare i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64>) + +declare i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32>) +declare i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32>) +declare i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32>) +declare i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32>) +declare i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32>) + +declare i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16>) +declare i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16>) +declare i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16>) +declare i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16>) +declare i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16>) +declare i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16>) + +declare i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8>) +declare i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8>) +declare i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8>) +declare i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8>) +declare i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8>) +declare i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-widen.ll b/llvm/test/CodeGen/X86/vector-reduce-or-widen.ll new file mode 100644 index 00000000000..720b47371e3 --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-reduce-or-widen.ll @@ -0,0 +1,1168 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 + +; +; vXi64 +; + +define i64 @test_v2i64(<2 x i64> %a0) { +; SSE-LABEL: test_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movq %xmm1, %rax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v4i64(<4 x i64> %a0) { +; SSE-LABEL: test_v4i64: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movq %xmm1, %rax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v8i64(<8 x i64> %a0) { +; SSE-LABEL: test_v8i64: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v8i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v16i64(<16 x i64> %a0) { +; SSE-LABEL: test_v16i64: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v16i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> %a0) + ret i64 %1 +} + +; +; vXi32 +; + +define i32 @test_v2i32(<2 x i32> %a0) { +; SSE-LABEL: test_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v4i32(<4 x i32> %a0) { +; SSE-LABEL: test_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v8i32(<8 x i32> %a0) { +; SSE-LABEL: test_v8i32: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v16i32(<16 x i32> %a0) { +; SSE-LABEL: test_v16i32: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v16i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v32i32(<32 x i32> %a0) { +; SSE-LABEL: test_v32i32: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v32i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> %a0) + ret i32 %1 +} + +; +; vXi16 +; + +define i16 @test_v2i16(<2 x i16> %a0) { +; SSE-LABEL: test_v2i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v4i16(<4 x i16> %a0) { +; SSE-LABEL: test_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v8i16(<8 x i16> %a0) { +; SSE-LABEL: test_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v16i16(<16 x i16> %a0) { +; SSE-LABEL: test_v16i16: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v32i16(<32 x i16> %a0) { +; SSE-LABEL: test_v32i16: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v32i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v64i16(<64 x i16> %a0) { +; SSE-LABEL: test_v64i16: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v64i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v64i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v64i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> %a0) + ret i16 %1 +} + +; +; vXi8 +; + +define i8 @test_v2i8(<2 x i8> %a0) { +; SSE2-LABEL: test_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v4i8(<4 x i8> %a0) { +; SSE2-LABEL: test_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v8i8(<8 x i8> %a0) { +; SSE2-LABEL: test_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v16i8(<16 x i8> %a0) { +; SSE2-LABEL: test_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v32i8(<32 x i8> %a0) { +; SSE2-LABEL: test_v32i8: +; SSE2: # %bb.0: +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v32i8: +; SSE41: # %bb.0: +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v64i8(<64 x i8> %a0) { +; SSE2-LABEL: test_v64i8: +; SSE2: # %bb.0: +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v64i8: +; SSE41: # %bb.0: +; SSE41-NEXT: por %xmm3, %xmm1 +; SSE41-NEXT: por %xmm2, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v64i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v64i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v128i8(<128 x i8> %a0) { +; SSE2-LABEL: test_v128i8: +; SSE2: # %bb.0: +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v128i8: +; SSE41: # %bb.0: +; SSE41-NEXT: por %xmm6, %xmm2 +; SSE41-NEXT: por %xmm7, %xmm3 +; SSE41-NEXT: por %xmm5, %xmm3 +; SSE41-NEXT: por %xmm1, %xmm3 +; SSE41-NEXT: por %xmm4, %xmm2 +; SSE41-NEXT: por %xmm3, %xmm2 +; SSE41-NEXT: por %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v128i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v128i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v128i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> %a0) + ret i8 %1 +} + +declare i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64>) +declare i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64>) +declare i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64>) +declare i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64>) + +declare i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32>) +declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32>) +declare i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32>) +declare i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32>) +declare i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32>) + +declare i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16>) +declare i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16>) +declare i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16>) +declare i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16>) +declare i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16>) +declare i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16>) + +declare i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8>) +declare i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8>) +declare i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8>) +declare i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8>) +declare i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8>) +declare i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-smax-widen.ll b/llvm/test/CodeGen/X86/vector-reduce-smax-widen.ll new file mode 100644 index 00000000000..f3cbdfccc61 --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-reduce-smax-widen.ll @@ -0,0 +1,2001 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL + +; +; vXi64 +; + +define i64 @test_v2i64(<2 x i64> %a0) { +; SSE2-LABEL: test_v2i64: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movq %xmm3, %rax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: pxor %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movq %xmm2, %rax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: retq +; +; AVX512BW-LABEL: test_v2i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v4i64(<4 x i64> %a0) { +; SSE2-LABEL: test_v4i64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm3, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: pxor %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movq %xmm2, %rax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v4i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v8i64(<8 x i64> %a0) { +; SSE2-LABEL: test_v8i64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: movq %xmm3, %rax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm2, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: movdqa %xmm0, %xmm7 +; SSE41-NEXT: pxor %xmm5, %xmm7 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: pxor %xmm5, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm0 +; SSE41-NEXT: xorpd %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: xorpd %xmm5, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSE41-NEXT: movdqa %xmm3, %xmm2 +; SSE41-NEXT: pxor %xmm5, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm5 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v8i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm1 +; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v8i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v8i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v16i64(<16 x i64> %a0) { +; SSE2-LABEL: test_v16i64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: pxor %xmm8, %xmm10 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: por %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm7, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm6, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm9, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm9 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm9, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm8 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: movq %xmm3, %rax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm10 +; SSE41-NEXT: pxor %xmm9, %xmm10 +; SSE41-NEXT: movdqa %xmm10, %xmm11 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm11, %xmm0 +; SSE41-NEXT: por %xmm10, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm1 +; SSE41-NEXT: pxor %xmm9, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm10 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: pxor %xmm9, %xmm1 +; SSE41-NEXT: movdqa %xmm8, %xmm3 +; SSE41-NEXT: pxor %xmm9, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm9, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6 +; SSE41-NEXT: movapd %xmm6, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm4, %xmm1 +; SSE41-NEXT: xorpd %xmm9, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 +; SSE41-NEXT: movapd %xmm7, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm1 +; SSE41-NEXT: xorpd %xmm9, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 +; SSE41-NEXT: movapd %xmm7, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm6, %xmm1 +; SSE41-NEXT: xorpd %xmm9, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1] +; SSE41-NEXT: movdqa %xmm7, %xmm2 +; SSE41-NEXT: pxor %xmm9, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm9 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v16i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm11 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm11, %xmm5, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm9 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm10 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vblendvpd %xmm10, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm9, %xmm6, %xmm7, %xmm3 +; AVX1-NEXT: vblendvpd %xmm8, %xmm5, %xmm11, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm1 +; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm4 +; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v16i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v16i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> %a0) + ret i64 %1 +} + +; +; vXi32 +; + +define i32 @test_v2i32(<2 x i32> %a0) { +; SSE2-LABEL: test_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmaxsd %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v4i32(<4 x i32> %a0) { +; SSE2-LABEL: test_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pmaxsd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE41-NEXT: pmaxsd %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v8i32(<8 x i32> %a0) { +; SSE2-LABEL: test_v8i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmaxsd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pmaxsd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE41-NEXT: pmaxsd %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v16i32(<16 x i32> %a0) { +; SSE2-LABEL: test_v16i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmaxsd %xmm3, %xmm1 +; SSE41-NEXT: pmaxsd %xmm2, %xmm1 +; SSE41-NEXT: pmaxsd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pmaxsd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmaxsd %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v16i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmaxsd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v32i32(<32 x i32> %a0) { +; SSE2-LABEL: test_v32i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm8 +; SSE2-NEXT: por %xmm2, %xmm8 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v32i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmaxsd %xmm6, %xmm2 +; SSE41-NEXT: pmaxsd %xmm7, %xmm3 +; SSE41-NEXT: pmaxsd %xmm5, %xmm3 +; SSE41-NEXT: pmaxsd %xmm1, %xmm3 +; SSE41-NEXT: pmaxsd %xmm4, %xmm2 +; SSE41-NEXT: pmaxsd %xmm3, %xmm2 +; SSE41-NEXT: pmaxsd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE41-NEXT: pmaxsd %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmaxsd %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v32i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmaxsd %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpmaxsd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpmaxsd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpmaxsd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpmaxsd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmaxsd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpmaxsd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> %a0) + ret i32 %1 +} + +; +; vXi16 +; + +define i16 @test_v2i16(<2 x i16> %a0) { +; SSE-LABEL: test_v2i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pmaxsw %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v4i16(<4 x i16> %a0) { +; SSE-LABEL: test_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: pmaxsw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pmaxsw %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v8i16(<8 x i16> %a0) { +; SSE2-LABEL: test_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor {{.*}}(%rip), %xmm0 +; SSE41-NEXT: phminposuw %xmm0, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: xorl $32767, %eax # imm = 0x7FFF +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vphminposuw %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: xorl $32767, %eax # imm = 0x7FFF +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: xorl $32767, %eax # imm = 0x7FFF +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v16i16(<16 x i16> %a0) { +; SSE2-LABEL: test_v16i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pmaxsw %xmm1, %xmm0 +; SSE41-NEXT: pxor {{.*}}(%rip), %xmm0 +; SSE41-NEXT: phminposuw %xmm0, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: xorl $32767, %eax # imm = 0x7FFF +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: xorl $32767, %eax # imm = 0x7FFF +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: xorl $32767, %eax # imm = 0x7FFF +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: xorl $32767, %eax # imm = 0x7FFF +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v32i16(<32 x i16> %a0) { +; SSE2-LABEL: test_v32i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pmaxsw %xmm3, %xmm1 +; SSE2-NEXT: pmaxsw %xmm2, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v32i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pmaxsw %xmm3, %xmm1 +; SSE41-NEXT: pmaxsw %xmm2, %xmm1 +; SSE41-NEXT: pmaxsw %xmm0, %xmm1 +; SSE41-NEXT: pxor {{.*}}(%rip), %xmm1 +; SSE41-NEXT: phminposuw %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: xorl $32767, %eax # imm = 0x7FFF +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v32i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmaxsw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: xorl $32767, %eax # imm = 0x7FFF +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: xorl $32767, %eax # imm = 0x7FFF +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: xorl $32767, %eax # imm = 0x7FFF +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v64i16(<64 x i16> %a0) { +; SSE2-LABEL: test_v64i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pmaxsw %xmm6, %xmm2 +; SSE2-NEXT: pmaxsw %xmm7, %xmm3 +; SSE2-NEXT: pmaxsw %xmm5, %xmm3 +; SSE2-NEXT: pmaxsw %xmm1, %xmm3 +; SSE2-NEXT: pmaxsw %xmm4, %xmm2 +; SSE2-NEXT: pmaxsw %xmm3, %xmm2 +; SSE2-NEXT: pmaxsw %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pmaxsw %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v64i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pmaxsw %xmm7, %xmm3 +; SSE41-NEXT: pmaxsw %xmm5, %xmm3 +; SSE41-NEXT: pmaxsw %xmm1, %xmm3 +; SSE41-NEXT: pmaxsw %xmm6, %xmm2 +; SSE41-NEXT: pmaxsw %xmm4, %xmm2 +; SSE41-NEXT: pmaxsw %xmm3, %xmm2 +; SSE41-NEXT: pmaxsw %xmm0, %xmm2 +; SSE41-NEXT: pxor {{.*}}(%rip), %xmm2 +; SSE41-NEXT: phminposuw %xmm2, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: xorl $32767, %eax # imm = 0x7FFF +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v64i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpmaxsw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpmaxsw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpmaxsw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmaxsw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxsw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmaxsw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: xorl $32767, %eax # imm = 0x7FFF +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v64i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmaxsw %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpmaxsw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: xorl $32767, %eax # imm = 0x7FFF +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v64i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: xorl $32767, %eax # imm = 0x7FFF +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> %a0) + ret i16 %1 +} + +; +; vXi8 +; + +define i8 @test_v2i8(<2 x i8> %a0) { +; SSE2-LABEL: test_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pmaxsb %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v2i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v4i8(<4 x i8> %a0) { +; SSE2-LABEL: test_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pmaxsb %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pmaxsb %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v4i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v8i8(<8 x i8> %a0) { +; SSE2-LABEL: test_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmaxsb %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pmaxsb %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pmaxsb %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v16i8(<16 x i8> %a0) { +; SSE2-LABEL: test_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor {{.*}}(%rip), %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pminub %xmm0, %xmm1 +; SSE41-NEXT: phminposuw %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: xorb $127, %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vphminposuw %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: xorb $127, %al +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: xorb $127, %al +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v32i8(<32 x i8> %a0) { +; SSE2-LABEL: test_v32i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v32i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmaxsb %xmm1, %xmm0 +; SSE41-NEXT: pxor {{.*}}(%rip), %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pminub %xmm0, %xmm1 +; SSE41-NEXT: phminposuw %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: xorb $127, %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: xorb $127, %al +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: xorb $127, %al +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: xorb $127, %al +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v64i8(<64 x i8> %a0) { +; SSE2-LABEL: test_v64i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pcmpgtb %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v64i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmaxsb %xmm3, %xmm1 +; SSE41-NEXT: pmaxsb %xmm2, %xmm1 +; SSE41-NEXT: pmaxsb %xmm0, %xmm1 +; SSE41-NEXT: pxor {{.*}}(%rip), %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pminub %xmm1, %xmm0 +; SSE41-NEXT: phminposuw %xmm0, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: xorb $127, %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v64i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmaxsb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: xorb $127, %al +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v64i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: xorb $127, %al +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: xorb $127, %al +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v128i8(<128 x i8> %a0) { +; SSE2-LABEL: test_v128i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: pcmpgtb %xmm6, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm8 +; SSE2-NEXT: por %xmm2, %xmm8 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pcmpgtb %xmm7, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm5, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtb %xmm8, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v128i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmaxsb %xmm7, %xmm3 +; SSE41-NEXT: pmaxsb %xmm5, %xmm3 +; SSE41-NEXT: pmaxsb %xmm1, %xmm3 +; SSE41-NEXT: pmaxsb %xmm6, %xmm2 +; SSE41-NEXT: pmaxsb %xmm4, %xmm2 +; SSE41-NEXT: pmaxsb %xmm3, %xmm2 +; SSE41-NEXT: pmaxsb %xmm0, %xmm2 +; SSE41-NEXT: pxor {{.*}}(%rip), %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pminub %xmm2, %xmm0 +; SSE41-NEXT: phminposuw %xmm0, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: xorb $127, %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v128i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpmaxsb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpmaxsb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpmaxsb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmaxsb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxsb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmaxsb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: xorb $127, %al +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v128i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmaxsb %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpmaxsb %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: xorb $127, %al +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v128i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: xorb $127, %al +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> %a0) + ret i8 %1 +} + +declare i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64>) +declare i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64>) +declare i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64>) +declare i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64>) + +declare i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32>) +declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>) +declare i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32>) +declare i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32>) +declare i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32>) + +declare i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16>) +declare i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16>) +declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>) +declare i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16>) +declare i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16>) +declare i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16>) + +declare i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8>) +declare i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8>) +declare i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8>) +declare i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8>) +declare i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8>) +declare i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-smin-widen.ll b/llvm/test/CodeGen/X86/vector-reduce-smin-widen.ll new file mode 100644 index 00000000000..f1d77f3b99e --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-reduce-smin-widen.ll @@ -0,0 +1,1999 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL + +; +; vXi64 +; + +define i64 @test_v2i64(<2 x i64> %a0) { +; SSE2-LABEL: test_v2i64: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movq %xmm3, %rax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movq %xmm2, %rax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: retq +; +; AVX512BW-LABEL: test_v2i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v4i64(<4 x i64> %a0) { +; SSE2-LABEL: test_v4i64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movq %xmm2, %rax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v4i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v8i64(<8 x i64> %a0) { +; SSE2-LABEL: test_v8i64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm5 +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm5, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: movq %xmm3, %rax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm5, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: xorpd %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm1 +; SSE41-NEXT: xorpd %xmm5, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v8i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v8i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v8i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v16i64(<16 x i64> %a0) { +; SSE2-LABEL: test_v16i64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: pxor %xmm8, %xmm10 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm9 +; SSE2-NEXT: por %xmm2, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm7, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm9, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: movq %xmm3, %rax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm10 +; SSE41-NEXT: pxor %xmm9, %xmm10 +; SSE41-NEXT: movdqa %xmm10, %xmm11 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm11, %xmm0 +; SSE41-NEXT: por %xmm10, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm2 +; SSE41-NEXT: pxor %xmm9, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm10 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm7, %xmm2 +; SSE41-NEXT: pxor %xmm9, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm2 +; SSE41-NEXT: pxor %xmm9, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm1 +; SSE41-NEXT: xorpd %xmm9, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm6, %xmm1 +; SSE41-NEXT: xorpd %xmm9, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 +; SSE41-NEXT: movapd %xmm6, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm1 +; SSE41-NEXT: xorpd %xmm9, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1] +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm9 +; SSE41-NEXT: movdqa %xmm9, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v16i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm9 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm11 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm11, %xmm7, %xmm10 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm6 +; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vblendvpd %xmm10, %xmm11, %xmm7, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm6 +; AVX1-NEXT: vblendvpd %xmm9, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vblendvpd %xmm8, %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm5, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm4 +; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm3, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v16i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v16i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> %a0) + ret i64 %1 +} + +; +; vXi32 +; + +define i32 @test_v2i32(<2 x i32> %a0) { +; SSE2-LABEL: test_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pminsd %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v4i32(<4 x i32> %a0) { +; SSE2-LABEL: test_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pminsd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE41-NEXT: pminsd %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v8i32(<8 x i32> %a0) { +; SSE2-LABEL: test_v8i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pminsd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pminsd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE41-NEXT: pminsd %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v16i32(<16 x i32> %a0) { +; SSE2-LABEL: test_v16i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pminsd %xmm3, %xmm1 +; SSE41-NEXT: pminsd %xmm2, %xmm1 +; SSE41-NEXT: pminsd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pminsd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pminsd %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v16i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpminsd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v32i32(<32 x i32> %a0) { +; SSE2-LABEL: test_v32i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm5, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm8 +; SSE2-NEXT: por %xmm1, %xmm8 +; SSE2-NEXT: movdqa %xmm7, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm6, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm8 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v32i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pminsd %xmm6, %xmm2 +; SSE41-NEXT: pminsd %xmm7, %xmm3 +; SSE41-NEXT: pminsd %xmm5, %xmm3 +; SSE41-NEXT: pminsd %xmm1, %xmm3 +; SSE41-NEXT: pminsd %xmm4, %xmm2 +; SSE41-NEXT: pminsd %xmm3, %xmm2 +; SSE41-NEXT: pminsd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE41-NEXT: pminsd %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pminsd %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v32i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpminsd %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpminsd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpminsd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpminsd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpminsd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpminsd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminsd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpminsd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> %a0) + ret i32 %1 +} + +; +; vXi16 +; + +define i16 @test_v2i16(<2 x i16> %a0) { +; SSE-LABEL: test_v2i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pminsw %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v4i16(<4 x i16> %a0) { +; SSE-LABEL: test_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: pminsw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pminsw %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v8i16(<8 x i16> %a0) { +; SSE2-LABEL: test_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: pminsw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor {{.*}}(%rip), %xmm0 +; SSE41-NEXT: phminposuw %xmm0, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: xorl $32768, %eax # imm = 0x8000 +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vphminposuw %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: xorl $32768, %eax # imm = 0x8000 +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: xorl $32768, %eax # imm = 0x8000 +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v16i16(<16 x i16> %a0) { +; SSE2-LABEL: test_v16i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pminsw %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: pminsw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pminsw %xmm1, %xmm0 +; SSE41-NEXT: pxor {{.*}}(%rip), %xmm0 +; SSE41-NEXT: phminposuw %xmm0, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: xorl $32768, %eax # imm = 0x8000 +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: xorl $32768, %eax # imm = 0x8000 +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: xorl $32768, %eax # imm = 0x8000 +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: xorl $32768, %eax # imm = 0x8000 +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v32i16(<32 x i16> %a0) { +; SSE2-LABEL: test_v32i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pminsw %xmm3, %xmm1 +; SSE2-NEXT: pminsw %xmm2, %xmm1 +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pminsw %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pminsw %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v32i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pminsw %xmm3, %xmm1 +; SSE41-NEXT: pminsw %xmm2, %xmm1 +; SSE41-NEXT: pminsw %xmm0, %xmm1 +; SSE41-NEXT: pxor {{.*}}(%rip), %xmm1 +; SSE41-NEXT: phminposuw %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: xorl $32768, %eax # imm = 0x8000 +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v32i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpminsw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: xorl $32768, %eax # imm = 0x8000 +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: xorl $32768, %eax # imm = 0x8000 +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: xorl $32768, %eax # imm = 0x8000 +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v64i16(<64 x i16> %a0) { +; SSE2-LABEL: test_v64i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pminsw %xmm6, %xmm2 +; SSE2-NEXT: pminsw %xmm7, %xmm3 +; SSE2-NEXT: pminsw %xmm5, %xmm3 +; SSE2-NEXT: pminsw %xmm1, %xmm3 +; SSE2-NEXT: pminsw %xmm4, %xmm2 +; SSE2-NEXT: pminsw %xmm3, %xmm2 +; SSE2-NEXT: pminsw %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pminsw %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pminsw %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v64i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pminsw %xmm7, %xmm3 +; SSE41-NEXT: pminsw %xmm5, %xmm3 +; SSE41-NEXT: pminsw %xmm1, %xmm3 +; SSE41-NEXT: pminsw %xmm6, %xmm2 +; SSE41-NEXT: pminsw %xmm4, %xmm2 +; SSE41-NEXT: pminsw %xmm3, %xmm2 +; SSE41-NEXT: pminsw %xmm0, %xmm2 +; SSE41-NEXT: pxor {{.*}}(%rip), %xmm2 +; SSE41-NEXT: phminposuw %xmm2, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: xorl $32768, %eax # imm = 0x8000 +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v64i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpminsw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpminsw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpminsw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpminsw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpminsw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpminsw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: xorl $32768, %eax # imm = 0x8000 +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v64i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminsw %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpminsw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: xorl $32768, %eax # imm = 0x8000 +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v64i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: xorl $32768, %eax # imm = 0x8000 +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> %a0) + ret i16 %1 +} + +; +; vXi8 +; + +define i8 @test_v2i8(<2 x i8> %a0) { +; SSE2-LABEL: test_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pminsb %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v2i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v4i8(<4 x i8> %a0) { +; SSE2-LABEL: test_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pminsb %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pminsb %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v4i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v8i8(<8 x i8> %a0) { +; SSE2-LABEL: test_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pminsb %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pminsb %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pminsb %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v16i8(<16 x i8> %a0) { +; SSE2-LABEL: test_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor {{.*}}(%rip), %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pminub %xmm0, %xmm1 +; SSE41-NEXT: phminposuw %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: xorb $-128, %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vphminposuw %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: xorb $-128, %al +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: xorb $-128, %al +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v32i8(<32 x i8> %a0) { +; SSE2-LABEL: test_v32i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v32i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pminsb %xmm1, %xmm0 +; SSE41-NEXT: pxor {{.*}}(%rip), %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pminub %xmm0, %xmm1 +; SSE41-NEXT: phminposuw %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: xorb $-128, %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: xorb $-128, %al +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: xorb $-128, %al +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: xorb $-128, %al +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v64i8(<64 x i8> %a0) { +; SSE2-LABEL: test_v64i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v64i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pminsb %xmm3, %xmm1 +; SSE41-NEXT: pminsb %xmm2, %xmm1 +; SSE41-NEXT: pminsb %xmm0, %xmm1 +; SSE41-NEXT: pxor {{.*}}(%rip), %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pminub %xmm1, %xmm0 +; SSE41-NEXT: phminposuw %xmm0, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: xorb $-128, %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v64i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpminsb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: xorb $-128, %al +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v64i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: xorb $-128, %al +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: xorb $-128, %al +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v128i8(<128 x i8> %a0) { +; SSE2-LABEL: test_v128i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm5, %xmm8 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm8 +; SSE2-NEXT: por %xmm1, %xmm8 +; SSE2-NEXT: movdqa %xmm7, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm6, %xmm0 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pcmpgtb %xmm8, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm8 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v128i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pminsb %xmm7, %xmm3 +; SSE41-NEXT: pminsb %xmm5, %xmm3 +; SSE41-NEXT: pminsb %xmm1, %xmm3 +; SSE41-NEXT: pminsb %xmm6, %xmm2 +; SSE41-NEXT: pminsb %xmm4, %xmm2 +; SSE41-NEXT: pminsb %xmm3, %xmm2 +; SSE41-NEXT: pminsb %xmm0, %xmm2 +; SSE41-NEXT: pxor {{.*}}(%rip), %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pminub %xmm2, %xmm0 +; SSE41-NEXT: phminposuw %xmm0, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: xorb $-128, %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v128i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpminsb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpminsb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpminsb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpminsb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpminsb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpminsb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: xorb $-128, %al +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v128i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminsb %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpminsb %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: xorb $-128, %al +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v128i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: xorb $-128, %al +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> %a0) + ret i8 %1 +} + +declare i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64>) +declare i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64>) +declare i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64>) +declare i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64>) + +declare i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32>) +declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) +declare i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32>) +declare i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32>) +declare i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32>) + +declare i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16>) +declare i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16>) +declare i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16>) +declare i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16>) +declare i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16>) +declare i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16>) + +declare i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8>) +declare i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8>) +declare i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8>) +declare i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8>) +declare i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8>) +declare i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax-widen.ll b/llvm/test/CodeGen/X86/vector-reduce-umax-widen.ll new file mode 100644 index 00000000000..371d85a7def --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-reduce-umax-widen.ll @@ -0,0 +1,2203 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL + +; +; vXi64 +; + +define i64 @test_v2i64(<2 x i64> %a0) { +; SSE2-LABEL: test_v2i64: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movq %xmm3, %rax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: pxor %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movq %xmm2, %rax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: retq +; +; AVX512BW-LABEL: test_v2i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v4i64(<4 x i64> %a0) { +; SSE2-LABEL: test_v4i64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm3, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: pxor %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movq %xmm2, %rax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v4i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v8i64(<8 x i64> %a0) { +; SSE2-LABEL: test_v8i64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: movq %xmm3, %rax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm2, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: movdqa %xmm0, %xmm7 +; SSE41-NEXT: pxor %xmm5, %xmm7 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: pxor %xmm5, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm0 +; SSE41-NEXT: xorpd %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: xorpd %xmm5, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSE41-NEXT: movdqa %xmm3, %xmm2 +; SSE41-NEXT: pxor %xmm5, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm5 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v8i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpxor %xmm2, %xmm6, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vxorpd %xmm2, %xmm4, %xmm5 +; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm4 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm3 +; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v8i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v8i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v16i64(<16 x i64> %a0) { +; SSE2-LABEL: test_v16i64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: pxor %xmm8, %xmm10 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: por %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm7, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm6, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm9, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm9 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm9, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm8 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: movq %xmm3, %rax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm10 +; SSE41-NEXT: pxor %xmm9, %xmm10 +; SSE41-NEXT: movdqa %xmm10, %xmm11 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm11, %xmm0 +; SSE41-NEXT: por %xmm10, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm1 +; SSE41-NEXT: pxor %xmm9, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm10 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: pxor %xmm9, %xmm1 +; SSE41-NEXT: movdqa %xmm8, %xmm3 +; SSE41-NEXT: pxor %xmm9, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm9, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6 +; SSE41-NEXT: movapd %xmm6, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm4, %xmm1 +; SSE41-NEXT: xorpd %xmm9, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 +; SSE41-NEXT: movapd %xmm7, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm1 +; SSE41-NEXT: xorpd %xmm9, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 +; SSE41-NEXT: movapd %xmm7, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm6, %xmm1 +; SSE41-NEXT: xorpd %xmm9, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1] +; SSE41-NEXT: movdqa %xmm7, %xmm2 +; SSE41-NEXT: pxor %xmm9, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm9 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v16i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm4, %xmm8, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm10 +; AVX1-NEXT: vpxor %xmm4, %xmm10, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm9 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm11 +; AVX1-NEXT: vpxor %xmm4, %xmm11, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm12 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm6 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm13 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm6 +; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3 +; AVX1-NEXT: vblendvpd %xmm13, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vblendvpd %xmm12, %xmm7, %xmm11, %xmm3 +; AVX1-NEXT: vxorpd %xmm4, %xmm3, %xmm5 +; AVX1-NEXT: vblendvpd %xmm9, %xmm10, %xmm8, %xmm6 +; AVX1-NEXT: vxorpd %xmm4, %xmm6, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vxorpd %xmm4, %xmm3, %xmm5 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2 +; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm5 +; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm6 +; AVX2-NEXT: vpcmpgtq %ymm5, %ymm6, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm2 +; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm5, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vxorpd %ymm4, %ymm1, %ymm2 +; AVX2-NEXT: vxorpd %ymm4, %ymm0, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm2 +; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm3 +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2 +; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v16i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v16i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> %a0) + ret i64 %1 +} + +; +; vXi32 +; + +define i32 @test_v2i32(<2 x i32> %a0) { +; SSE2-LABEL: test_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movd %xmm3, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmaxud %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v4i32(<4 x i32> %a0) { +; SSE2-LABEL: test_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pmaxud %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE41-NEXT: pmaxud %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v8i32(<8 x i32> %a0) { +; SSE2-LABEL: test_v8i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: movd %xmm3, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmaxud %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pmaxud %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE41-NEXT: pmaxud %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v16i32(<16 x i32> %a0) { +; SSE2-LABEL: test_v16i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmaxud %xmm3, %xmm1 +; SSE41-NEXT: pmaxud %xmm2, %xmm1 +; SSE41-NEXT: pmaxud %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pmaxud %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmaxud %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v16i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmaxud %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v32i32(<32 x i32> %a0) { +; SSE2-LABEL: test_v32i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm5, %xmm10 +; SSE2-NEXT: pxor %xmm8, %xmm10 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: por %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm6, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm9, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm9 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm9, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v32i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmaxud %xmm6, %xmm2 +; SSE41-NEXT: pmaxud %xmm7, %xmm3 +; SSE41-NEXT: pmaxud %xmm5, %xmm3 +; SSE41-NEXT: pmaxud %xmm1, %xmm3 +; SSE41-NEXT: pmaxud %xmm4, %xmm2 +; SSE41-NEXT: pmaxud %xmm3, %xmm2 +; SSE41-NEXT: pmaxud %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE41-NEXT: pmaxud %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmaxud %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v32i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmaxud %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpmaxud %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpmaxud %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpmaxud %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpmaxud %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmaxud %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmaxud %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpmaxud %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> %a0) + ret i32 %1 +} + +; +; vXi16 +; + +define i16 @test_v2i16(<2 x i16> %a0) { +; SSE2-LABEL: test_v2i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pmaxuw %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v4i16(<4 x i16> %a0) { +; SSE2-LABEL: test_v4i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmaxuw %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pmaxuw %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v8i16(<8 x i16> %a0) { +; SSE2-LABEL: test_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm1 +; SSE41-NEXT: phminposuw %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: notl %eax +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vphminposuw %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: notl %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq +; +; AVX512BW-LABEL: test_v8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: notl %eax +; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: notl %eax +; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512VL-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v16i16(<16 x i16> %a0) { +; SSE2-LABEL: test_v16i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pmaxuw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm1 +; SSE41-NEXT: phminposuw %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: notl %eax +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: notl %eax +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v16i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: notl %eax +; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v16i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: notl %eax +; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v32i16(<32 x i16> %a0) { +; SSE2-LABEL: test_v32i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pmaxsw %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pmaxsw %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pmaxsw %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v32i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pmaxuw %xmm3, %xmm1 +; SSE41-NEXT: pmaxuw %xmm2, %xmm1 +; SSE41-NEXT: pmaxuw %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: phminposuw %xmm0, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: notl %eax +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v32i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmaxuw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: notl %eax +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v32i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: notl %eax +; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v32i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: notl %eax +; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v64i16(<64 x i16> %a0) { +; SSE2-LABEL: test_v64i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: pmaxsw %xmm6, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pmaxsw %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm7 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: pmaxsw %xmm7, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pmaxsw %xmm3, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pmaxsw %xmm5, %xmm1 +; SSE2-NEXT: pmaxsw %xmm4, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v64i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pmaxuw %xmm7, %xmm3 +; SSE41-NEXT: pmaxuw %xmm5, %xmm3 +; SSE41-NEXT: pmaxuw %xmm1, %xmm3 +; SSE41-NEXT: pmaxuw %xmm6, %xmm2 +; SSE41-NEXT: pmaxuw %xmm4, %xmm2 +; SSE41-NEXT: pmaxuw %xmm3, %xmm2 +; SSE41-NEXT: pmaxuw %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: phminposuw %xmm0, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: notl %eax +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v64i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpmaxuw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpmaxuw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpmaxuw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmaxuw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxuw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmaxuw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: notl %eax +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v64i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmaxuw %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpmaxuw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v64i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: notl %eax +; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v64i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: notl %eax +; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> %a0) + ret i16 %1 +} + +; +; vXi8 +; + +define i8 @test_v2i8(<2 x i8> %a0) { +; SSE2-LABEL: test_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pmaxub %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pmaxub %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v2i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v4i8(<4 x i8> %a0) { +; SSE2-LABEL: test_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pmaxub %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pmaxub %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pmaxub %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pmaxub %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v4i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v8i8(<8 x i8> %a0) { +; SSE2-LABEL: test_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: pmaxub %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pmaxub %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pmaxub %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmaxub %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pmaxub %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pmaxub %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v16i8(<16 x i8> %a0) { +; SSE2-LABEL: test_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pmaxub %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: pmaxub %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pmaxub %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pmaxub %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pminub %xmm1, %xmm0 +; SSE41-NEXT: phminposuw %xmm0, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: notb %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vphminposuw %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: notb %al +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512BW-LABEL: test_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BW-NEXT: notb %al +; AVX512BW-NEXT: # kill: def $al killed $al killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax +; AVX512VL-NEXT: notb %al +; AVX512VL-NEXT: # kill: def $al killed $al killed $eax +; AVX512VL-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v32i8(<32 x i8> %a0) { +; SSE2-LABEL: test_v32i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pmaxub %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pmaxub %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: pmaxub %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pmaxub %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pmaxub %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v32i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmaxub %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pminub %xmm1, %xmm0 +; SSE41-NEXT: phminposuw %xmm0, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: notb %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: notb %al +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: notb %al +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v32i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BW-NEXT: notb %al +; AVX512BW-NEXT: # kill: def $al killed $al killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v32i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax +; AVX512VL-NEXT: notb %al +; AVX512VL-NEXT: # kill: def $al killed $al killed $eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v64i8(<64 x i8> %a0) { +; SSE2-LABEL: test_v64i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pmaxub %xmm3, %xmm1 +; SSE2-NEXT: pmaxub %xmm2, %xmm1 +; SSE2-NEXT: pmaxub %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pmaxub %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: pmaxub %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pmaxub %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pmaxub %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v64i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmaxub %xmm3, %xmm1 +; SSE41-NEXT: pmaxub %xmm2, %xmm1 +; SSE41-NEXT: pmaxub %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pminub %xmm0, %xmm1 +; SSE41-NEXT: phminposuw %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: notb %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v64i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmaxub %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: notb %al +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v64i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: notb %al +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v64i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BW-NEXT: notb %al +; AVX512BW-NEXT: # kill: def $al killed $al killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v64i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax +; AVX512VL-NEXT: notb %al +; AVX512VL-NEXT: # kill: def $al killed $al killed $eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v128i8(<128 x i8> %a0) { +; SSE2-LABEL: test_v128i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pmaxub %xmm6, %xmm2 +; SSE2-NEXT: pmaxub %xmm7, %xmm3 +; SSE2-NEXT: pmaxub %xmm5, %xmm3 +; SSE2-NEXT: pmaxub %xmm1, %xmm3 +; SSE2-NEXT: pmaxub %xmm4, %xmm2 +; SSE2-NEXT: pmaxub %xmm3, %xmm2 +; SSE2-NEXT: pmaxub %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pmaxub %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: pmaxub %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pmaxub %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pmaxub %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v128i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmaxub %xmm7, %xmm3 +; SSE41-NEXT: pmaxub %xmm5, %xmm3 +; SSE41-NEXT: pmaxub %xmm1, %xmm3 +; SSE41-NEXT: pmaxub %xmm6, %xmm2 +; SSE41-NEXT: pmaxub %xmm4, %xmm2 +; SSE41-NEXT: pmaxub %xmm3, %xmm2 +; SSE41-NEXT: pmaxub %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pminub %xmm0, %xmm1 +; SSE41-NEXT: phminposuw %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: notb %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v128i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpmaxub %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpmaxub %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpmaxub %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmaxub %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxub %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmaxub %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: notb %al +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v128i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmaxub %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpmaxub %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: notb %al +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v128i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BW-NEXT: notb %al +; AVX512BW-NEXT: # kill: def $al killed $al killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v128i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax +; AVX512VL-NEXT: notb %al +; AVX512VL-NEXT: # kill: def $al killed $al killed $eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> %a0) + ret i8 %1 +} + +declare i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64>) +declare i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64>) +declare i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64>) +declare i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64>) + +declare i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32>) +declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>) +declare i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32>) +declare i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32>) +declare i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32>) + +declare i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16>) +declare i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16>) +declare i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16>) +declare i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16>) +declare i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16>) +declare i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16>) + +declare i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8>) +declare i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8>) +declare i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8>) +declare i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8>) +declare i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8>) +declare i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin-widen.ll b/llvm/test/CodeGen/X86/vector-reduce-umin-widen.ll new file mode 100644 index 00000000000..f5cc88a7d6c --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-reduce-umin-widen.ll @@ -0,0 +1,2007 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL + +; +; vXi64 +; + +define i64 @test_v2i64(<2 x i64> %a0) { +; SSE2-LABEL: test_v2i64: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movq %xmm3, %rax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movq %xmm2, %rax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; AVX-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: retq +; +; AVX512BW-LABEL: test_v2i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v4i64(<4 x i64> %a0) { +; SSE2-LABEL: test_v4i64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movq %xmm2, %rax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vxorpd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v4i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v8i64(<8 x i64> %a0) { +; SSE2-LABEL: test_v8i64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm5 +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm5, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: movq %xmm3, %rax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm5, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: xorpd %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm1 +; SSE41-NEXT: xorpd %xmm5, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v8i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 +; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm6 +; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm1 +; AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 +; AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v8i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v8i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v16i64(<16 x i64> %a0) { +; SSE2-LABEL: test_v16i64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: pxor %xmm8, %xmm10 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm9 +; SSE2-NEXT: por %xmm2, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm7, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm9, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: movq %xmm3, %rax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm10 +; SSE41-NEXT: pxor %xmm9, %xmm10 +; SSE41-NEXT: movdqa %xmm10, %xmm11 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm11, %xmm0 +; SSE41-NEXT: por %xmm10, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm2 +; SSE41-NEXT: pxor %xmm9, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm10 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm7, %xmm2 +; SSE41-NEXT: pxor %xmm9, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm2 +; SSE41-NEXT: pxor %xmm9, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm1 +; SSE41-NEXT: xorpd %xmm9, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm6, %xmm1 +; SSE41-NEXT: xorpd %xmm9, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 +; SSE41-NEXT: movapd %xmm6, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm1 +; SSE41-NEXT: xorpd %xmm9, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1] +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm9 +; SSE41-NEXT: movdqa %xmm9, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v16i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm8 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm6 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm9 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm12 +; AVX1-NEXT: vpxor %xmm4, %xmm12, %xmm10 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm13 +; AVX1-NEXT: vpxor %xmm4, %xmm13, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm10, %xmm5, %xmm10 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm11 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 +; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm11, %xmm6, %xmm6 +; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vxorpd %xmm4, %xmm5, %xmm11 +; AVX1-NEXT: vblendvpd %xmm10, %xmm12, %xmm13, %xmm7 +; AVX1-NEXT: vxorpd %xmm4, %xmm7, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm11, %xmm6, %xmm6 +; AVX1-NEXT: vblendvpd %xmm9, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm8, %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm1 +; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm7, %xmm2 +; AVX1-NEXT: vxorpd %xmm4, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2 +; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm6 +; AVX2-NEXT: vpcmpgtq %ymm5, %ymm6, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm3 +; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm5 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vxorpd %ymm4, %ymm0, %ymm2 +; AVX2-NEXT: vxorpd %ymm4, %ymm1, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2 +; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2 +; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v16i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v16i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> %a0) + ret i64 %1 +} + +; +; vXi32 +; + +define i32 @test_v2i32(<2 x i32> %a0) { +; SSE2-LABEL: test_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pminud %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v4i32(<4 x i32> %a0) { +; SSE2-LABEL: test_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3] +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pminud %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE41-NEXT: pminud %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v8i32(<8 x i32> %a0) { +; SSE2-LABEL: test_v8i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pminud %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pminud %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE41-NEXT: pminud %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v16i32(<16 x i32> %a0) { +; SSE2-LABEL: test_v16i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm6 +; SSE2-NEXT: por %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm6, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: movd %xmm4, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pminud %xmm3, %xmm1 +; SSE41-NEXT: pminud %xmm2, %xmm1 +; SSE41-NEXT: pminud %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pminud %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pminud %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v16i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v32i32(<32 x i32> %a0) { +; SSE2-LABEL: test_v32i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm10 +; SSE2-NEXT: pxor %xmm8, %xmm10 +; SSE2-NEXT: movdqa %xmm6, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm9 +; SSE2-NEXT: por %xmm2, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm7, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm4 +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm4, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm9, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm8 +; SSE2-NEXT: por %xmm3, %xmm8 +; SSE2-NEXT: movd %xmm8, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v32i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pminud %xmm6, %xmm2 +; SSE41-NEXT: pminud %xmm7, %xmm3 +; SSE41-NEXT: pminud %xmm5, %xmm3 +; SSE41-NEXT: pminud %xmm1, %xmm3 +; SSE41-NEXT: pminud %xmm4, %xmm2 +; SSE41-NEXT: pminud %xmm3, %xmm2 +; SSE41-NEXT: pminud %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE41-NEXT: pminud %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pminud %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v32i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpminud %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpminud %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpminud %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpminud %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> %a0) + ret i32 %1 +} + +; +; vXi16 +; + +define i16 @test_v2i16(<2 x i16> %a0) { +; SSE2-LABEL: test_v2i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pminuw %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v4i16(<4 x i16> %a0) { +; SSE2-LABEL: test_v4i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pminsw %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pminuw %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pminuw %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v8i16(<8 x i16> %a0) { +; SSE2-LABEL: test_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pminsw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: phminposuw %xmm0, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vphminposuw %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v16i16(<16 x i16> %a0) { +; SSE2-LABEL: test_v16i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pminsw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pminsw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pminuw %xmm1, %xmm0 +; SSE41-NEXT: phminposuw %xmm0, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v32i16(<32 x i16> %a0) { +; SSE2-LABEL: test_v32i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pminsw %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pminsw %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pminsw %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pminsw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v32i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pminuw %xmm3, %xmm1 +; SSE41-NEXT: pminuw %xmm2, %xmm1 +; SSE41-NEXT: pminuw %xmm0, %xmm1 +; SSE41-NEXT: phminposuw %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v32i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpminuw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v64i16(<64 x i16> %a0) { +; SSE2-LABEL: test_v64i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: pminsw %xmm6, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pminsw %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm7 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: pminsw %xmm7, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pminsw %xmm3, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pminsw %xmm5, %xmm1 +; SSE2-NEXT: pminsw %xmm4, %xmm1 +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: pminsw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: pminsw %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v64i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pminuw %xmm7, %xmm3 +; SSE41-NEXT: pminuw %xmm5, %xmm3 +; SSE41-NEXT: pminuw %xmm1, %xmm3 +; SSE41-NEXT: pminuw %xmm6, %xmm2 +; SSE41-NEXT: pminuw %xmm4, %xmm2 +; SSE41-NEXT: pminuw %xmm3, %xmm2 +; SSE41-NEXT: pminuw %xmm0, %xmm2 +; SSE41-NEXT: phminposuw %xmm2, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v64i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpminuw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpminuw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpminuw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpminuw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpminuw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpminuw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v64i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminuw %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpminuw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v64i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> %a0) + ret i16 %1 +} + +; +; vXi8 +; + +define i8 @test_v2i8(<2 x i8> %a0) { +; SSE2-LABEL: test_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pminub %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pminub %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v2i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v4i8(<4 x i8> %a0) { +; SSE2-LABEL: test_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pminub %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pminub %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pminub %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pminub %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v4i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v8i8(<8 x i8> %a0) { +; SSE2-LABEL: test_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: pminub %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pminub %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pminub %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pminub %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pminub %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pminub %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v16i8(<16 x i8> %a0) { +; SSE2-LABEL: test_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pminub %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: pminub %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pminub %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pminub %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pminub %xmm0, %xmm1 +; SSE41-NEXT: phminposuw %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vphminposuw %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v32i8(<32 x i8> %a0) { +; SSE2-LABEL: test_v32i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pminub %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pminub %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: pminub %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pminub %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pminub %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v32i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pminub %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pminub %xmm0, %xmm1 +; SSE41-NEXT: phminposuw %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v64i8(<64 x i8> %a0) { +; SSE2-LABEL: test_v64i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pminub %xmm3, %xmm1 +; SSE2-NEXT: pminub %xmm2, %xmm1 +; SSE2-NEXT: pminub %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pminub %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: pminub %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pminub %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pminub %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v64i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pminub %xmm3, %xmm1 +; SSE41-NEXT: pminub %xmm2, %xmm1 +; SSE41-NEXT: pminub %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pminub %xmm1, %xmm0 +; SSE41-NEXT: phminposuw %xmm0, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v64i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpminub %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v64i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v128i8(<128 x i8> %a0) { +; SSE2-LABEL: test_v128i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pminub %xmm6, %xmm2 +; SSE2-NEXT: pminub %xmm7, %xmm3 +; SSE2-NEXT: pminub %xmm5, %xmm3 +; SSE2-NEXT: pminub %xmm1, %xmm3 +; SSE2-NEXT: pminub %xmm4, %xmm2 +; SSE2-NEXT: pminub %xmm3, %xmm2 +; SSE2-NEXT: pminub %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pminub %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: pminub %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pminub %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pminub %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v128i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pminub %xmm7, %xmm3 +; SSE41-NEXT: pminub %xmm5, %xmm3 +; SSE41-NEXT: pminub %xmm1, %xmm3 +; SSE41-NEXT: pminub %xmm6, %xmm2 +; SSE41-NEXT: pminub %xmm4, %xmm2 +; SSE41-NEXT: pminub %xmm3, %xmm2 +; SSE41-NEXT: pminub %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pminub %xmm2, %xmm0 +; SSE41-NEXT: phminposuw %xmm0, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v128i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpminub %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpminub %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpminub %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpminub %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpminub %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpminub %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v128i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminub %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpminub %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v128i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> %a0) + ret i8 %1 +} + +declare i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64>) +declare i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64>) +declare i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64>) +declare i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64>) + +declare i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32>) +declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>) +declare i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32>) +declare i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32>) +declare i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32>) + +declare i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16>) +declare i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16>) +declare i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16>) +declare i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16>) +declare i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16>) +declare i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16>) + +declare i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8>) +declare i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8>) +declare i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8>) +declare i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8>) +declare i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8>) +declare i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-widen.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-widen.ll new file mode 100644 index 00000000000..7bd3db37c83 --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-reduce-xor-widen.ll @@ -0,0 +1,1168 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 + +; +; vXi64 +; + +define i64 @test_v2i64(<2 x i64> %a0) { +; SSE-LABEL: test_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: movq %xmm1, %rax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v4i64(<4 x i64> %a0) { +; SSE-LABEL: test_v4i64: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: movq %xmm1, %rax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v8i64(<8 x i64> %a0) { +; SSE-LABEL: test_v8i64: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm3, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v8i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64> %a0) + ret i64 %1 +} + +define i64 @test_v16i64(<16 x i64> %a0) { +; SSE-LABEL: test_v16i64: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm6, %xmm2 +; SSE-NEXT: pxor %xmm7, %xmm3 +; SSE-NEXT: pxor %xmm5, %xmm3 +; SSE-NEXT: pxor %xmm1, %xmm3 +; SSE-NEXT: pxor %xmm4, %xmm2 +; SSE-NEXT: pxor %xmm3, %xmm2 +; SSE-NEXT: pxor %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v16i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vxorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> %a0) + ret i64 %1 +} + +; +; vXi32 +; + +define i32 @test_v2i32(<2 x i32> %a0) { +; SSE-LABEL: test_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v4i32(<4 x i32> %a0) { +; SSE-LABEL: test_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v8i32(<8 x i32> %a0) { +; SSE-LABEL: test_v8i32: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v16i32(<16 x i32> %a0) { +; SSE-LABEL: test_v16i32: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm3, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v16i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32> %a0) + ret i32 %1 +} + +define i32 @test_v32i32(<32 x i32> %a0) { +; SSE-LABEL: test_v32i32: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm6, %xmm2 +; SSE-NEXT: pxor %xmm7, %xmm3 +; SSE-NEXT: pxor %xmm5, %xmm3 +; SSE-NEXT: pxor %xmm1, %xmm3 +; SSE-NEXT: pxor %xmm4, %xmm2 +; SSE-NEXT: pxor %xmm3, %xmm2 +; SSE-NEXT: pxor %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v32i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vxorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> %a0) + ret i32 %1 +} + +; +; vXi16 +; + +define i16 @test_v2i16(<2 x i16> %a0) { +; SSE-LABEL: test_v2i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v4i16(<4 x i16> %a0) { +; SSE-LABEL: test_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v8i16(<8 x i16> %a0) { +; SSE-LABEL: test_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX-LABEL: test_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v16i16(<16 x i16> %a0) { +; SSE-LABEL: test_v16i16: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v32i16(<32 x i16> %a0) { +; SSE-LABEL: test_v32i16: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm3, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v32i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> %a0) + ret i16 %1 +} + +define i16 @test_v64i16(<64 x i16> %a0) { +; SSE-LABEL: test_v64i16: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm6, %xmm2 +; SSE-NEXT: pxor %xmm7, %xmm3 +; SSE-NEXT: pxor %xmm5, %xmm3 +; SSE-NEXT: pxor %xmm1, %xmm3 +; SSE-NEXT: pxor %xmm4, %xmm2 +; SSE-NEXT: pxor %xmm3, %xmm2 +; SSE-NEXT: pxor %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v64i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vxorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v64i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v64i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> %a0) + ret i16 %1 +} + +; +; vXi8 +; + +define i8 @test_v2i8(<2 x i8> %a0) { +; SSE2-LABEL: test_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v4i8(<4 x i8> %a0) { +; SSE2-LABEL: test_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v8i8(<8 x i8> %a0) { +; SSE2-LABEL: test_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pxor %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v16i8(<16 x i8> %a0) { +; SSE2-LABEL: test_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pxor %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v32i8(<32 x i8> %a0) { +; SSE2-LABEL: test_v32i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v32i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pxor %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v64i8(<64 x i8> %a0) { +; SSE2-LABEL: test_v64i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v64i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm3, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pxor %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v64i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v64i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> %a0) + ret i8 %1 +} + +define i8 @test_v128i8(<128 x i8> %a0) { +; SSE2-LABEL: test_v128i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm6, %xmm2 +; SSE2-NEXT: pxor %xmm7, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v128i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm6, %xmm2 +; SSE41-NEXT: pxor %xmm7, %xmm3 +; SSE41-NEXT: pxor %xmm5, %xmm3 +; SSE41-NEXT: pxor %xmm1, %xmm3 +; SSE41-NEXT: pxor %xmm4, %xmm2 +; SSE41-NEXT: pxor %xmm3, %xmm2 +; SSE41-NEXT: pxor %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pxor %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v128i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vxorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v128i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v128i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> %a0) + ret i8 %1 +} + +declare i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64>) +declare i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64>) +declare i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64>) +declare i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64>) + +declare i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32>) +declare i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32>) +declare i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32>) +declare i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32>) +declare i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32>) + +declare i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16>) +declare i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16>) +declare i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16>) +declare i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16>) +declare i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16>) +declare i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16>) + +declare i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8>) +declare i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8>) +declare i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8>) +declare i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8>) +declare i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8>) +declare i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-sext-widen.ll b/llvm/test/CodeGen/X86/vector-sext-widen.ll new file mode 100644 index 00000000000..08e8b514c60 --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-sext-widen.ll @@ -0,0 +1,3966 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW +; +; Just two 32-bit runs to make sure we do reasonable things there. +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X32-SSE,X32-SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=X32-SSE,X32-SSE41 + +define <8 x i16> @sext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_16i8_to_8i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_16i8_to_8i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: psraw $8, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_16i8_to_8i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sext_16i8_to_8i16: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X32-SSE2-LABEL: sext_16i8_to_8i16: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: psraw $8, %xmm0 +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: sext_16i8_to_8i16: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %C = sext <8 x i8> %B to <8 x i16> + ret <8 x i16> %C +} + +define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_16i8_to_16i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: psraw $8, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; SSE2-NEXT: psraw $8, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_16i8_to_16i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: psraw $8, %xmm2 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; SSSE3-NEXT: psraw $8, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_16i8_to_16i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbw %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxbw %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_16i8_to_16i16: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_16i8_to_16i16: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_16i8_to_16i16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: sext_16i8_to_16i16: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; X32-SSE2-NEXT: psraw $8, %xmm2 +; X32-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; X32-SSE2-NEXT: psraw $8, %xmm1 +; X32-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: sext_16i8_to_16i16: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm1 +; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = sext <16 x i8> %A to <16 x i16> + ret <16 x i16> %B +} + +define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_32i8_to_32i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: psraw $8, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; SSE2-NEXT: psraw $8, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: psraw $8, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; SSE2-NEXT: psraw $8, %xmm3 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_32i8_to_32i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSSE3-NEXT: psraw $8, %xmm4 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; SSSE3-NEXT: psraw $8, %xmm5 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSSE3-NEXT: psraw $8, %xmm2 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; SSSE3-NEXT: psraw $8, %xmm3 +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_32i8_to_32i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbw %xmm0, %xmm5 +; SSE41-NEXT: pmovsxbw %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxbw %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pmovsxbw %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_32i8_to_32i16: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_32i8_to_32i16: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxbw %xmm0, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1 +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: sext_32i8_to_32i16: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm2 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: sext_32i8_to_32i16: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 +; AVX512BW-NEXT: retq +; +; X32-SSE2-LABEL: sext_32i8_to_32i16: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; X32-SSE2-NEXT: psraw $8, %xmm4 +; X32-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; X32-SSE2-NEXT: psraw $8, %xmm5 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; X32-SSE2-NEXT: psraw $8, %xmm2 +; X32-SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; X32-SSE2-NEXT: psraw $8, %xmm3 +; X32-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X32-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: sext_32i8_to_32i16: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm5 +; X32-SSE41-NEXT: pmovsxbw %xmm1, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm4 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm3 +; X32-SSE41-NEXT: movdqa %xmm5, %xmm0 +; X32-SSE41-NEXT: movdqa %xmm4, %xmm1 +; X32-SSE41-NEXT: retl +entry: + %B = sext <32 x i8> %A to <32 x i16> + ret <32 x i16> %B +} + +define <4 x i32> @sext_16i8_to_4i32(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_16i8_to_4i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_16i8_to_4i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_16i8_to_4i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sext_16i8_to_4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X32-SSE2-LABEL: sext_16i8_to_4i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: psrad $24, %xmm0 +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: sext_16i8_to_4i32: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %C = sext <4 x i8> %B to <4 x i32> + ret <4 x i32> %C +} + +define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_16i8_to_8i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: psrad $24, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_16i8_to_8i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: psrad $24, %xmm2 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: psrad $24, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_16i8_to_8i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovsxbd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_16i8_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_16i8_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_16i8_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: sext_16i8_to_8i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X32-SSE2-NEXT: psrad $24, %xmm2 +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X32-SSE2-NEXT: psrad $24, %xmm1 +; X32-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: sext_16i8_to_8i32: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm1 +; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %C = sext <8 x i8> %B to <8 x i32> + ret <8 x i32> %C +} + +define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_16i8_to_16i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE2-NEXT: psrad $24, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: psrad $24, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: psrad $24, %xmm3 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_16i8_to_16i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSSE3-NEXT: psrad $24, %xmm4 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: psrad $24, %xmm1 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: psrad $24, %xmm2 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: psrad $24, %xmm3 +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_16i8_to_16i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovsxbd %xmm1, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxbd %xmm2, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE41-NEXT: pmovsxbd %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_16i8_to_16i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_16i8_to_16i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm1 +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_16i8_to_16i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: sext_16i8_to_16i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; X32-SSE2-NEXT: psrad $24, %xmm4 +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: psrad $24, %xmm1 +; X32-SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X32-SSE2-NEXT: psrad $24, %xmm2 +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; X32-SSE2-NEXT: psrad $24, %xmm3 +; X32-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: sext_16i8_to_16i32: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm4 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X32-SSE41-NEXT: pmovsxbd %xmm1, %xmm1 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pmovsxbd %xmm2, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm3 +; X32-SSE41-NEXT: movdqa %xmm4, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = sext <16 x i8> %A to <16 x i32> + ret <16 x i32> %B +} + +define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_16i8_to_2i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_16i8_to_2i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 +; SSSE3-NEXT: psrad $24, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_16i8_to_2i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbq %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sext_16i8_to_2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxbq %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X32-SSE2-LABEL: sext_16i8_to_2i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X32-SSE2-NEXT: psrad $24, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: sext_16i8_to_2i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1> + %C = sext <2 x i8> %B to <2 x i64> + ret <2 x i64> %C +} + +define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_16i8_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_16i8_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: psrad $24, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_16i8_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbq %xmm0, %xmm2 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pmovsxbq %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_16i8_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_16i8_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_16i8_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxbq %xmm0, %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: sext_16i8_to_4i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X32-SSE2-NEXT: psrad $24, %xmm1 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: sext_16i8_to_4i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm2 +; X32-SSE41-NEXT: psrld $16, %xmm0 +; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm1 +; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %C = sext <4 x i8> %B to <4 x i64> + ret <4 x i64> %C +} + +define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_16i8_to_8i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: psrad $24, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_16i8_to_8i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: psrad $24, %xmm1 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: psrad $24, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_16i8_to_8i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbq %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pmovsxbq %xmm1, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovsxbq %xmm2, %xmm2 +; SSE41-NEXT: psrlq $48, %xmm0 +; SSE41-NEXT: pmovsxbq %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_16i8_to_8i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_16i8_to_8i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxbq %xmm0, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmovsxbq %xmm0, %ymm1 +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_16i8_to_8i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxbq %xmm0, %zmm0 +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: sext_16i8_to_8i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X32-SSE2-NEXT: psrad $24, %xmm1 +; X32-SSE2-NEXT: pxor %xmm5, %xmm5 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X32-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; X32-SSE2-NEXT: psrad $24, %xmm3 +; X32-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; X32-SSE2-NEXT: movdqa %xmm3, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; X32-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: sext_16i8_to_8i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm4 +; X32-SSE41-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE41-NEXT: psrld $16, %xmm1 +; X32-SSE41-NEXT: pmovsxbq %xmm1, %xmm1 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; X32-SSE41-NEXT: pmovsxbq %xmm2, %xmm2 +; X32-SSE41-NEXT: psrlq $48, %xmm0 +; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm3 +; X32-SSE41-NEXT: movdqa %xmm4, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %C = sext <8 x i8> %B to <8 x i64> + ret <8 x i64> %C +} + +define <4 x i32> @sext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_8i16_to_4i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_8i16_to_4i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_8i16_to_4i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sext_8i16_to_4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X32-SSE2-LABEL: sext_8i16_to_4i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: psrad $16, %xmm0 +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: sext_8i16_to_4i32: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %C = sext <4 x i16> %B to <4 x i32> + ret <4 x i32> %C +} + +define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_8i16_to_8i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_8i16_to_8i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: psrad $16, %xmm2 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_8i16_to_8i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxwd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxwd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_8i16_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_8i16_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_8i16_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: sext_8i16_to_8i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X32-SSE2-NEXT: psrad $16, %xmm2 +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X32-SSE2-NEXT: psrad $16, %xmm1 +; X32-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: sext_8i16_to_8i32: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm1 +; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = sext <8 x i16> %A to <8 x i32> + ret <8 x i32> %B +} + +define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_16i16_to_16i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_16i16_to_16i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSSE3-NEXT: psrad $16, %xmm4 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSSE3-NEXT: psrad $16, %xmm5 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSSE3-NEXT: psrad $16, %xmm2 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSSE3-NEXT: psrad $16, %xmm3 +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_16i16_to_16i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxwd %xmm0, %xmm5 +; SSE41-NEXT: pmovsxwd %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxwd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pmovsxwd %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_16i16_to_16i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_16i16_to_16i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm1 +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_16i16_to_16i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: sext_16i16_to_16i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; X32-SSE2-NEXT: psrad $16, %xmm4 +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; X32-SSE2-NEXT: psrad $16, %xmm5 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X32-SSE2-NEXT: psrad $16, %xmm2 +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; X32-SSE2-NEXT: psrad $16, %xmm3 +; X32-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X32-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: sext_16i16_to_16i32: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm5 +; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm4 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm3 +; X32-SSE41-NEXT: movdqa %xmm5, %xmm0 +; X32-SSE41-NEXT: movdqa %xmm4, %xmm1 +; X32-SSE41-NEXT: retl +entry: + %B = sext <16 x i16> %A to <16 x i32> + ret <16 x i32> %B +} + +define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_8i16_to_2i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_8i16_to_2i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_8i16_to_2i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxwq %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sext_8i16_to_2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxwq %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X32-SSE2-LABEL: sext_8i16_to_2i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X32-SSE2-NEXT: psrad $16, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: sext_8i16_to_2i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = shufflevector <8 x i16> %A, <8 x i16> undef, <2 x i32> <i32 0, i32 1> + %C = sext <2 x i16> %B to <2 x i64> + ret <2 x i64> %C +} + +define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_8i16_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_8i16_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_8i16_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxwq %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovsxwq %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_8i16_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxwq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_8i16_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_8i16_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxwq %xmm0, %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: sext_8i16_to_4i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X32-SSE2-NEXT: psrad $16, %xmm1 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: sext_8i16_to_4i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm1 +; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %C = sext <4 x i16> %B to <4 x i64> + ret <4 x i64> %C +} + +define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_8i16_to_8i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_8i16_to_8i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: psrad $16, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_8i16_to_8i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxwq %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovsxwq %xmm1, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxwq %xmm2, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE41-NEXT: pmovsxwq %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_8i16_to_8i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxwq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwq %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] +; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_8i16_to_8i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm1 +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_8i16_to_8i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: sext_8i16_to_8i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X32-SSE2-NEXT: psrad $16, %xmm1 +; X32-SSE2-NEXT: pxor %xmm5, %xmm5 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X32-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; X32-SSE2-NEXT: psrad $16, %xmm3 +; X32-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; X32-SSE2-NEXT: movdqa %xmm3, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; X32-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: sext_8i16_to_8i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm4 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X32-SSE41-NEXT: pmovsxwq %xmm1, %xmm1 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pmovsxwq %xmm2, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm3 +; X32-SSE41-NEXT: movdqa %xmm4, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = sext <8 x i16> %A to <8 x i64> + ret <8 x i64> %B +} + +define <2 x i64> @sext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_4i32_to_2i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_4i32_to_2i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_4i32_to_2i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxdq %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sext_4i32_to_2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X32-SSE2-LABEL: sext_4i32_to_2i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: sext_4i32_to_2i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 0, i32 1> + %C = sext <2 x i32> %B to <2 x i64> + ret <2 x i64> %C +} + +define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_4i32_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_4i32_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_4i32_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxdq %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxdq %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_4i32_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_4i32_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_4i32_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: sext_4i32_to_4i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pxor %xmm3, %xmm3 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: sext_4i32_to_4i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1 +; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = sext <4 x i32> %A to <4 x i64> + ret <4 x i64> %B +} + +define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_8i32_to_8i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_8i32_to_8i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_8i32_to_8i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxdq %xmm0, %xmm5 +; SSE41-NEXT: pmovsxdq %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxdq %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pmovsxdq %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_8i32_to_8i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_8i32_to_8i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm1 +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_8i32_to_8i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0 +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: sext_8i32_to_8i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE2-NEXT: pxor %xmm4, %xmm4 +; X32-SSE2-NEXT: pxor %xmm3, %xmm3 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; X32-SSE2-NEXT: pxor %xmm5, %xmm5 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm5 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X32-SSE2-NEXT: pxor %xmm3, %xmm3 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; X32-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: sext_8i32_to_8i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm5 +; X32-SSE41-NEXT: pmovsxdq %xmm1, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm4 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm3 +; X32-SSE41-NEXT: movdqa %xmm5, %xmm0 +; X32-SSE41-NEXT: movdqa %xmm4, %xmm1 +; X32-SSE41-NEXT: retl +entry: + %B = sext <8 x i32> %A to <8 x i64> + ret <8 x i64> %B +} + +define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) { +; SSE-LABEL: load_sext_2i1_to_2i64: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movzbl (%rdi), %eax +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: shlq $62, %rcx +; SSE-NEXT: movq %rcx, %xmm0 +; SSE-NEXT: shlq $63, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: psrad $31, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE-NEXT: retq +; +; AVX1-LABEL: load_sext_2i1_to_2i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: movzbl (%rdi), %eax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $62, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: shlq $63, %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_2i1_to_2i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: movzbl (%rdi), %eax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $62, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: shlq $63, %rax +; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_sext_2i1_to_2i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: kmovw (%rdi), %k1 +; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: load_sext_2i1_to_2i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movzbl (%eax), %eax +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $30, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; X32-SSE2-NEXT: shll $31, %eax +; X32-SSE2-NEXT: movd %eax, %xmm0 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: load_sext_2i1_to_2i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: movzbl (%eax), %eax +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $31, %ecx +; X32-SSE41-NEXT: movd %ecx, %xmm0 +; X32-SSE41-NEXT: pinsrd $1, %ecx, %xmm0 +; X32-SSE41-NEXT: shll $30, %eax +; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm0 +; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0 +; X32-SSE41-NEXT: psrad $31, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <2 x i1>, <2 x i1>* %ptr + %Y = sext <2 x i1> %X to <2 x i64> + ret <2 x i64> %Y +} + +define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) { +; SSE2-LABEL: load_sext_2i8_to_2i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movzwl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_2i8_to_2i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movzwl (%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 +; SSSE3-NEXT: psrad $24, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_2i8_to_2i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbq (%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: load_sext_2i8_to_2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxbq (%rdi), %xmm0 +; AVX-NEXT: retq +; +; X32-SSE2-LABEL: load_sext_2i8_to_2i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movzwl (%eax), %eax +; X32-SSE2-NEXT: movd %eax, %xmm0 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X32-SSE2-NEXT: psrad $24, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: load_sext_2i8_to_2i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <2 x i8>, <2 x i8>* %ptr + %Y = sext <2 x i8> %X to <2 x i64> + ret <2 x i64> %Y +} + +define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) { +; SSE2-LABEL: load_sext_4i1_to_4i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movl (%rdi), %eax +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shlq $60, %rcx +; SSE2-NEXT: sarq $63, %rcx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shlq $61, %rcx +; SSE2-NEXT: sarq $63, %rcx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shlq $62, %rcx +; SSE2-NEXT: sarq $63, %rcx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: shlq $63, %rax +; SSE2-NEXT: sarq $63, %rax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_4i1_to_4i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movl (%rdi), %eax +; SSSE3-NEXT: movq %rax, %rcx +; SSSE3-NEXT: shlq $60, %rcx +; SSSE3-NEXT: sarq $63, %rcx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movq %rax, %rcx +; SSSE3-NEXT: shlq $61, %rcx +; SSSE3-NEXT: sarq $63, %rcx +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movq %rax, %rcx +; SSSE3-NEXT: shlq $62, %rcx +; SSSE3-NEXT: sarq $63, %rcx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: shlq $63, %rax +; SSSE3-NEXT: sarq $63, %rax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_4i1_to_4i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movl (%rdi), %eax +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $62, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: movq %rax, %rdx +; SSE41-NEXT: shlq $63, %rdx +; SSE41-NEXT: sarq $63, %rdx +; SSE41-NEXT: movd %edx, %xmm0 +; SSE41-NEXT: pinsrd $1, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $61, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 +; SSE41-NEXT: shlq $60, %rax +; SSE41-NEXT: sarq $63, %rax +; SSE41-NEXT: pinsrd $3, %eax, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_4i1_to_4i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: movl (%rdi), %eax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $62, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: movq %rax, %rdx +; AVX1-NEXT: shlq $63, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: vmovd %edx, %xmm0 +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $61, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: shlq $60, %rax +; AVX1-NEXT: sarq $63, %rax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_4i1_to_4i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: movl (%rdi), %eax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $62, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: movq %rax, %rdx +; AVX2-NEXT: shlq $63, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: vmovd %edx, %xmm0 +; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $61, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: shlq $60, %rax +; AVX2-NEXT: sarq $63, %rax +; AVX2-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_sext_4i1_to_4i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: kmovw (%rdi), %k1 +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: load_sext_4i1_to_4i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movl (%eax), %eax +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $28, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $29, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $30, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: shll $31, %eax +; X32-SSE2-NEXT: movd %eax, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: load_sext_4i1_to_4i32: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: movl (%eax), %eax +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $30, %ecx +; X32-SSE41-NEXT: movl %eax, %edx +; X32-SSE41-NEXT: shll $31, %edx +; X32-SSE41-NEXT: movd %edx, %xmm0 +; X32-SSE41-NEXT: pinsrd $1, %ecx, %xmm0 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $29, %ecx +; X32-SSE41-NEXT: pinsrd $2, %ecx, %xmm0 +; X32-SSE41-NEXT: shll $28, %eax +; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0 +; X32-SSE41-NEXT: psrad $31, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <4 x i1>, <4 x i1>* %ptr + %Y = sext <4 x i1> %X to <4 x i32> + ret <4 x i32> %Y +} + +define <4 x i32> @load_sext_4i8_to_4i32(<4 x i8> *%ptr) { +; SSE2-LABEL: load_sext_4i8_to_4i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_4i8_to_4i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_4i8_to_4i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbd (%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: load_sext_4i8_to_4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxbd (%rdi), %xmm0 +; AVX-NEXT: retq +; +; X32-SSE2-LABEL: load_sext_4i8_to_4i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: psrad $24, %xmm0 +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: load_sext_4i8_to_4i32: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxbd (%eax), %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <4 x i8>, <4 x i8>* %ptr + %Y = sext <4 x i8> %X to <4 x i32> + ret <4 x i32> %Y +} + +define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) { +; SSE2-LABEL: load_sext_4i1_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movl (%rdi), %eax +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $3, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $2, %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: shrl %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3] +; SSE2-NEXT: psllq $63, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3] +; SSE2-NEXT: psllq $63, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_4i1_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movl (%rdi), %eax +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrl $3, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrl $2, %ecx +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: shrl %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3] +; SSSE3-NEXT: psllq $63, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3] +; SSSE3-NEXT: psllq $63, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_4i1_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movl (%rdi), %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrl %ecx +; SSE41-NEXT: movd %eax, %xmm1 +; SSE41-NEXT: pinsrd $1, %ecx, %xmm1 +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrl $2, %ecx +; SSE41-NEXT: pinsrd $2, %ecx, %xmm1 +; SSE41-NEXT: shrl $3, %eax +; SSE41-NEXT: pinsrd $3, %eax, %xmm1 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: psllq $63, %xmm0 +; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; SSE41-NEXT: psllq $63, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_4i1_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: movl (%rdi), %eax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $62, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: movq %rax, %rdx +; AVX1-NEXT: shlq $63, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: vmovd %edx, %xmm0 +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $61, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: shlq $60, %rax +; AVX1-NEXT: sarq $63, %rax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_4i1_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: movl (%rdi), %eax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $60, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $61, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $62, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: shlq $63, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_sext_4i1_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: kmovw (%rdi), %k1 +; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: load_sext_4i1_to_4i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movzbl (%eax), %eax +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $3, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $2, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: movd %eax, %xmm2 +; X32-SSE2-NEXT: shrl %eax +; X32-SSE2-NEXT: movd %eax, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm2 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3] +; X32-SSE2-NEXT: psllq $63, %xmm0 +; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3] +; X32-SSE2-NEXT: psllq $63, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: load_sext_4i1_to_4i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: movzbl (%eax), %eax +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shrl %ecx +; X32-SSE41-NEXT: movd %eax, %xmm1 +; X32-SSE41-NEXT: pinsrd $1, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shrl $2, %ecx +; X32-SSE41-NEXT: pinsrd $2, %ecx, %xmm1 +; X32-SSE41-NEXT: shrl $3, %eax +; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm1 +; X32-SSE41-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X32-SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; X32-SSE41-NEXT: psllq $63, %xmm0 +; X32-SSE41-NEXT: psrad $31, %xmm0 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; X32-SSE41-NEXT: psllq $63, %xmm1 +; X32-SSE41-NEXT: psrad $31, %xmm1 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X32-SSE41-NEXT: retl +entry: + %X = load <4 x i1>, <4 x i1>* %ptr + %Y = sext <4 x i1> %X to <4 x i64> + ret <4 x i64> %Y +} + +define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) { +; SSE2-LABEL: load_sext_4i8_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_4i8_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: psrad $24, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_4i8_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbq (%rdi), %xmm0 +; SSE41-NEXT: pmovsxbq 2(%rdi), %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_4i8_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxbq 2(%rdi), %xmm0 +; AVX1-NEXT: vpmovsxbq (%rdi), %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_4i8_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_sext_4i8_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxbq (%rdi), %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: load_sext_4i8_to_4i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X32-SSE2-NEXT: psrad $24, %xmm1 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: load_sext_4i8_to_4i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0 +; X32-SSE41-NEXT: pmovsxbq 2(%eax), %xmm1 +; X32-SSE41-NEXT: retl +entry: + %X = load <4 x i8>, <4 x i8>* %ptr + %Y = sext <4 x i8> %X to <4 x i64> + ret <4 x i64> %Y +} + +define <2 x i64> @load_sext_4i8_to_4i64_extract(<4 x i8> *%ptr) { +; SSE2-LABEL: load_sext_4i8_to_4i64_extract: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_4i8_to_4i64_extract: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_4i8_to_4i64_extract: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbq 2(%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_4i8_to_4i64_extract: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxbq 2(%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_4i8_to_4i64_extract: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_sext_4i8_to_4i64_extract: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq (%rdi), %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: load_sext_4i8_to_4i64_extract: +; X32-SSE2: # %bb.0: +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: psrad $24, %xmm0 +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: load_sext_4i8_to_4i64_extract: +; X32-SSE41: # %bb.0: +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxbq 2(%eax), %xmm0 +; X32-SSE41-NEXT: retl + %ld = load <4 x i8>, <4 x i8>* %ptr + %sext = sext <4 x i8> %ld to <4 x i64> + %extract = shufflevector <4 x i64> %sext, <4 x i64> undef, <2 x i32> <i32 2, i32 3> + ret <2 x i64> %extract +} + +define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) { +; SSE-LABEL: load_sext_8i1_to_8i16: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pcmpeqw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: load_sext_8i1_to_8i16: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_8i1_to_8i16: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_sext_8i1_to_8i16: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: kmovw (%rdi), %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: load_sext_8i1_to_8i16: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; X32-SSE-LABEL: load_sext_8i1_to_8i16: +; X32-SSE: # %bb.0: # %entry +; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; X32-SSE-NEXT: pand %xmm1, %xmm0 +; X32-SSE-NEXT: pcmpeqw %xmm1, %xmm0 +; X32-SSE-NEXT: retl +entry: + %X = load <8 x i1>, <8 x i1>* %ptr + %Y = sext <8 x i1> %X to <8 x i16> + ret <8 x i16> %Y +} + +define <8 x i16> @load_sext_8i8_to_8i16(<8 x i8> *%ptr) { +; SSE2-LABEL: load_sext_8i8_to_8i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_8i8_to_8i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: psraw $8, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_8i8_to_8i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbw (%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: load_sext_8i8_to_8i16: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxbw (%rdi), %xmm0 +; AVX-NEXT: retq +; +; X32-SSE2-LABEL: load_sext_8i8_to_8i16: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: psraw $8, %xmm0 +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: load_sext_8i8_to_8i16: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxbw (%eax), %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <8 x i8>, <8 x i8>* %ptr + %Y = sext <8 x i8> %X to <8 x i16> + ret <8 x i16> %Y +} + +define <8 x i64> @load_sext_8i8_to_8i64(<8 x i8> *%ptr) { +; SSE2-LABEL: load_sext_8i8_to_8i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: psrad $24, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_8i8_to_8i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: psrad $24, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSSE3-NEXT: psrad $24, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_8i8_to_8i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbq (%rdi), %xmm0 +; SSE41-NEXT: pmovsxbq 2(%rdi), %xmm1 +; SSE41-NEXT: pmovsxbq 4(%rdi), %xmm2 +; SSE41-NEXT: pmovsxbq 6(%rdi), %xmm3 +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_8i8_to_8i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxbq 6(%rdi), %xmm1 +; AVX1-NEXT: vpmovsxbq 4(%rdi), %xmm2 +; AVX1-NEXT: vpmovsxbq 2(%rdi), %xmm0 +; AVX1-NEXT: vpmovsxbq (%rdi), %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_8i8_to_8i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0 +; AVX2-NEXT: vpmovsxbq 4(%rdi), %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_sext_8i8_to_8i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxbq (%rdi), %zmm0 +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: load_sext_8i8_to_8i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X32-SSE2-NEXT: psrad $24, %xmm1 +; X32-SSE2-NEXT: pxor %xmm4, %xmm4 +; X32-SSE2-NEXT: pxor %xmm3, %xmm3 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; X32-SSE2-NEXT: psrad $24, %xmm3 +; X32-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X32-SSE2-NEXT: movdqa %xmm3, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: load_sext_8i8_to_8i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0 +; X32-SSE41-NEXT: pmovsxbq 2(%eax), %xmm1 +; X32-SSE41-NEXT: pmovsxbq 4(%eax), %xmm2 +; X32-SSE41-NEXT: pmovsxbq 6(%eax), %xmm3 +; X32-SSE41-NEXT: retl +entry: + %X = load <8 x i8>, <8 x i8>* %ptr + %Y = sext <8 x i8> %X to <8 x i64> + ret <8 x i64> %Y +} + +define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) { +; SSE-LABEL: load_sext_8i1_to_8i32: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [16,32,64,128] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: load_sext_8i1_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_8i1_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastd (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_sext_8i1_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: kmovw (%rdi), %k1 +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512-NEXT: retq +; +; X32-SSE-LABEL: load_sext_8i1_to_8i32: +; X32-SSE: # %bb.0: # %entry +; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8] +; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [16,32,64,128] +; X32-SSE-NEXT: pand %xmm2, %xmm1 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm1 +; X32-SSE-NEXT: retl +entry: + %X = load <8 x i1>, <8 x i1>* %ptr + %Y = sext <8 x i1> %X to <8 x i32> + ret <8 x i32> %Y +} + +define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) { +; SSE2-LABEL: load_sext_8i8_to_8i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_8i8_to_8i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: psrad $24, %xmm0 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: psrad $24, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_8i8_to_8i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbd (%rdi), %xmm0 +; SSE41-NEXT: pmovsxbd 4(%rdi), %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_8i8_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxbd 4(%rdi), %xmm0 +; AVX1-NEXT: vpmovsxbd (%rdi), %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_8i8_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxbd (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_sext_8i8_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxbd (%rdi), %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: load_sext_8i8_to_8i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X32-SSE2-NEXT: psrad $24, %xmm0 +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: psrad $24, %xmm1 +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: load_sext_8i8_to_8i32: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxbd (%eax), %xmm0 +; X32-SSE41-NEXT: pmovsxbd 4(%eax), %xmm1 +; X32-SSE41-NEXT: retl +entry: + %X = load <8 x i8>, <8 x i8>* %ptr + %Y = sext <8 x i8> %X to <8 x i32> + ret <8 x i32> %Y +} + +define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone { +; SSE2-LABEL: load_sext_16i1_to_16i8: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_16i1_to_16i8: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pcmpeqb %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_16i1_to_16i8: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_16i1_to_16i8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [-1.7939930131212661E-307,-1.7939930131212661E-307] +; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_16i1_to_16i8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_sext_16i1_to_16i8: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: kmovw (%rdi), %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: load_sext_16i1_to_16i8: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; X32-SSE2-LABEL: load_sext_16i1_to_16i8: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; X32-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; X32-SSE2-NEXT: pand %xmm1, %xmm0 +; X32-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: load_sext_16i1_to_16i8: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; X32-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; X32-SSE41-NEXT: pand %xmm1, %xmm0 +; X32-SSE41-NEXT: pcmpeqb %xmm1, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <16 x i1>, <16 x i1>* %ptr + %Y = sext <16 x i1> %X to <16 x i8> + ret <16 x i8> %Y +} + +define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { +; SSE-LABEL: load_sext_16i1_to_16i16: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pcmpeqw %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: load_sext_16i1_to_16i16: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_16i1_to_16i16: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_sext_16i1_to_16i16: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: kmovw (%rdi), %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: load_sext_16i1_to_16i16: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512BW-NEXT: retq +; +; X32-SSE-LABEL: load_sext_16i1_to_16i16: +; X32-SSE: # %bb.0: # %entry +; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] +; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768] +; X32-SSE-NEXT: pand %xmm2, %xmm1 +; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm1 +; X32-SSE-NEXT: retl +entry: + %X = load <16 x i1>, <16 x i1>* %ptr + %Y = sext <16 x i1> %X to <16 x i16> + ret <16 x i16> %Y +} + +define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { +; SSE-LABEL: load_sext_32i1_to_32i8: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pcmpeqb %xmm2, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pcmpeqb %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: load_sext_32i1_to_32i8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_32i1_to_32i8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_sext_32i1_to_32i8: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: kmovw (%rdi), %k1 +; AVX512F-NEXT: kmovw 2(%rdi), %k2 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: load_sext_32i1_to_32i8: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: kmovd (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512BW-NEXT: retq +; +; X32-SSE-LABEL: load_sext_32i1_to_32i8: +; X32-SSE: # %bb.0: # %entry +; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm0 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,3,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; X32-SSE-NEXT: pand %xmm2, %xmm1 +; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1 +; X32-SSE-NEXT: retl +entry: + %X = load <32 x i1>, <32 x i1>* %ptr + %Y = sext <32 x i1> %X to <32 x i8> + ret <32 x i8> %Y +} + +define <16 x i16> @load_sext_16i8_to_16i16(<16 x i8> *%ptr) { +; SSE2-LABEL: load_sext_16i8_to_16i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: psraw $8, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_16i8_to_16i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa (%rdi), %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: psraw $8, %xmm0 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: psraw $8, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_16i8_to_16i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbw (%rdi), %xmm0 +; SSE41-NEXT: pmovsxbw 8(%rdi), %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_16i8_to_16i16: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxbw 8(%rdi), %xmm0 +; AVX1-NEXT: vpmovsxbw (%rdi), %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_16i8_to_16i16: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxbw (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_sext_16i8_to_16i16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxbw (%rdi), %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: load_sext_16i8_to_16i16: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movdqa (%eax), %xmm1 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X32-SSE2-NEXT: psraw $8, %xmm0 +; X32-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X32-SSE2-NEXT: psraw $8, %xmm1 +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: load_sext_16i8_to_16i16: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxbw (%eax), %xmm0 +; X32-SSE41-NEXT: pmovsxbw 8(%eax), %xmm1 +; X32-SSE41-NEXT: retl +entry: + %X = load <16 x i8>, <16 x i8>* %ptr + %Y = sext <16 x i8> %X to <16 x i16> + ret <16 x i16> %Y +} + +define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) { +; SSE2-LABEL: load_sext_2i16_to_2i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_2i16_to_2i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_2i16_to_2i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxwq (%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: load_sext_2i16_to_2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxwq (%rdi), %xmm0 +; AVX-NEXT: retq +; +; X32-SSE2-LABEL: load_sext_2i16_to_2i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X32-SSE2-NEXT: psrad $16, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: load_sext_2i16_to_2i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxwq (%eax), %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <2 x i16>, <2 x i16>* %ptr + %Y = sext <2 x i16> %X to <2 x i64> + ret <2 x i64> %Y +} + +define <4 x i32> @load_sext_4i16_to_4i32(<4 x i16> *%ptr) { +; SSE2-LABEL: load_sext_4i16_to_4i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_4i16_to_4i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_4i16_to_4i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxwd (%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: load_sext_4i16_to_4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxwd (%rdi), %xmm0 +; AVX-NEXT: retq +; +; X32-SSE2-LABEL: load_sext_4i16_to_4i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: psrad $16, %xmm0 +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: load_sext_4i16_to_4i32: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxwd (%eax), %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <4 x i16>, <4 x i16>* %ptr + %Y = sext <4 x i16> %X to <4 x i32> + ret <4 x i32> %Y +} + +define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) { +; SSE2-LABEL: load_sext_4i16_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_4i16_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_4i16_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxwq (%rdi), %xmm0 +; SSE41-NEXT: pmovsxwq 4(%rdi), %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_4i16_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxwq 4(%rdi), %xmm0 +; AVX1-NEXT: vpmovsxwq (%rdi), %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_4i16_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxwq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_sext_4i16_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxwq (%rdi), %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: load_sext_4i16_to_4i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X32-SSE2-NEXT: psrad $16, %xmm1 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: load_sext_4i16_to_4i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxwq (%eax), %xmm0 +; X32-SSE41-NEXT: pmovsxwq 4(%eax), %xmm1 +; X32-SSE41-NEXT: retl +entry: + %X = load <4 x i16>, <4 x i16>* %ptr + %Y = sext <4 x i16> %X to <4 x i64> + ret <4 x i64> %Y +} + +define <8 x i32> @load_sext_8i16_to_8i32(<8 x i16> *%ptr) { +; SSE2-LABEL: load_sext_8i16_to_8i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_8i16_to_8i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa (%rdi), %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_8i16_to_8i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxwd (%rdi), %xmm0 +; SSE41-NEXT: pmovsxwd 8(%rdi), %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_8i16_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm0 +; AVX1-NEXT: vpmovsxwd (%rdi), %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_8i16_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxwd (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_sext_8i16_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxwd (%rdi), %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: load_sext_8i16_to_8i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movdqa (%eax), %xmm1 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X32-SSE2-NEXT: psrad $16, %xmm0 +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: psrad $16, %xmm1 +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: load_sext_8i16_to_8i32: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxwd (%eax), %xmm0 +; X32-SSE41-NEXT: pmovsxwd 8(%eax), %xmm1 +; X32-SSE41-NEXT: retl +entry: + %X = load <8 x i16>, <8 x i16>* %ptr + %Y = sext <8 x i16> %X to <8 x i32> + ret <8 x i32> %Y +} + +define <2 x i64> @load_sext_2i32_to_2i64(<2 x i32> *%ptr) { +; SSE2-LABEL: load_sext_2i32_to_2i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_2i32_to_2i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_2i32_to_2i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxdq (%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: load_sext_2i32_to_2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxdq (%rdi), %xmm0 +; AVX-NEXT: retq +; +; X32-SSE2-LABEL: load_sext_2i32_to_2i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: load_sext_2i32_to_2i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxdq (%eax), %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <2 x i32>, <2 x i32>* %ptr + %Y = sext <2 x i32> %X to <2 x i64> + ret <2 x i64> %Y +} + +define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) { +; SSE2-LABEL: load_sext_4i32_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_4i32_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa (%rdi), %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_4i32_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxdq (%rdi), %xmm0 +; SSE41-NEXT: pmovsxdq 8(%rdi), %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_4i32_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxdq 8(%rdi), %xmm0 +; AVX1-NEXT: vpmovsxdq (%rdi), %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_4i32_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxdq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_sext_4i32_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxdq (%rdi), %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: load_sext_4i32_to_4i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movdqa (%eax), %xmm0 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pxor %xmm3, %xmm3 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: load_sext_4i32_to_4i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxdq (%eax), %xmm0 +; X32-SSE41-NEXT: pmovsxdq 8(%eax), %xmm1 +; X32-SSE41-NEXT: retl +entry: + %X = load <4 x i32>, <4 x i32>* %ptr + %Y = sext <4 x i32> %X to <4 x i64> + ret <4 x i64> %Y +} + +define i32 @sext_2i8_to_i32(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_2i8_to_i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_2i8_to_i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: psraw $8, %xmm0 +; SSSE3-NEXT: movd %xmm0, %eax +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_2i8_to_i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: retq +; +; AVX-LABEL: sext_2i8_to_i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq +; +; X32-SSE2-LABEL: sext_2i8_to_i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: psraw $8, %xmm0 +; X32-SSE2-NEXT: movd %xmm0, %eax +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: sext_2i8_to_i32: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm0 +; X32-SSE41-NEXT: movd %xmm0, %eax +; X32-SSE41-NEXT: retl +entry: + %Shuf = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1> + %Ex = sext <2 x i8> %Shuf to <2 x i16> + %Bc = bitcast <2 x i16> %Ex to i32 + ret i32 %Bc +} + +define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { +; SSE2-LABEL: sext_4i1_to_4i64: +; SSE2: # %bb.0: +; SSE2-NEXT: pslld $31, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_4i1_to_4i64: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pslld $31, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_4i1_to_4i64: +; SSE41: # %bb.0: +; SSE41-NEXT: pslld $31, %xmm0 +; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxdq %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxdq %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_4i1_to_4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_4i1_to_4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_4i1_to_4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: sext_4i1_to_4i64: +; X32-SSE2: # %bb.0: +; X32-SSE2-NEXT: pslld $31, %xmm0 +; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pxor %xmm3, %xmm3 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: sext_4i1_to_4i64: +; X32-SSE41: # %bb.0: +; X32-SSE41-NEXT: pslld $31, %xmm0 +; X32-SSE41-NEXT: psrad $31, %xmm0 +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1 +; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE41-NEXT: retl + %extmask = sext <4 x i1> %mask to <4 x i64> + ret <4 x i64> %extmask +} + +define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) { +; SSE2-LABEL: sext_4i8_to_4i64: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_4i8_to_4i64: +; SSSE3: # %bb.0: +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: psrad $24, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_4i8_to_4i64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbq %xmm0, %xmm2 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pmovsxbq %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_4i8_to_4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_4i8_to_4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_4i8_to_4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq %xmm0, %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: sext_4i8_to_4i64: +; X32-SSE2: # %bb.0: +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X32-SSE2-NEXT: psrad $24, %xmm1 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: sext_4i8_to_4i64: +; X32-SSE41: # %bb.0: +; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm2 +; X32-SSE41-NEXT: psrld $16, %xmm0 +; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm1 +; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE41-NEXT: retl + %extmask = sext <4 x i8> %mask to <4 x i64> + ret <4 x i64> %extmask +} + +define <32 x i8> @sext_32xi1_to_32xi8(<32 x i16> %c1, <32 x i16> %c2)nounwind { +; SSE-LABEL: sext_32xi1_to_32xi8: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqw %xmm5, %xmm1 +; SSE-NEXT: pcmpeqw %xmm4, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: pcmpeqw %xmm7, %xmm3 +; SSE-NEXT: pcmpeqw %xmm6, %xmm2 +; SSE-NEXT: packsswb %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: sext_32xi1_to_32xi8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpcmpeqw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_32xi1_to_32xi8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: sext_32xi1_to_32xi8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: sext_32xi1_to_32xi8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512BW-NEXT: retq +; +; X32-SSE-LABEL: sext_32xi1_to_32xi8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pushl %ebp +; X32-SSE-NEXT: movl %esp, %ebp +; X32-SSE-NEXT: andl $-16, %esp +; X32-SSE-NEXT: subl $16, %esp +; X32-SSE-NEXT: movdqa 8(%ebp), %xmm3 +; X32-SSE-NEXT: pcmpeqw 40(%ebp), %xmm1 +; X32-SSE-NEXT: pcmpeqw 24(%ebp), %xmm0 +; X32-SSE-NEXT: packsswb %xmm1, %xmm0 +; X32-SSE-NEXT: pcmpeqw 72(%ebp), %xmm3 +; X32-SSE-NEXT: pcmpeqw 56(%ebp), %xmm2 +; X32-SSE-NEXT: packsswb %xmm3, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm1 +; X32-SSE-NEXT: movl %ebp, %esp +; X32-SSE-NEXT: popl %ebp +; X32-SSE-NEXT: retl + %a = icmp eq <32 x i16> %c1, %c2 + %b = sext <32 x i1> %a to <32 x i8> + ret <32 x i8> %b +} + +define <2 x i32> @sext_2i8_to_2i32(<2 x i8>* %addr) { +; SSE2-LABEL: sext_2i8_to_2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movzwl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_2i8_to_2i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movzwl (%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm0 +; SSSE3-NEXT: paddd %xmm0, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_2i8_to_2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movzwl (%rdi), %eax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 +; SSE41-NEXT: paddd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sext_2i8_to_2i32: +; AVX: # %bb.0: +; AVX-NEXT: movzwl (%rdi), %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X32-SSE2-LABEL: sext_2i8_to_2i32: +; X32-SSE2: # %bb.0: +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movzwl (%eax), %eax +; X32-SSE2-NEXT: movd %eax, %xmm0 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: psrad $24, %xmm0 +; X32-SSE2-NEXT: paddd %xmm0, %xmm0 +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: sext_2i8_to_2i32: +; X32-SSE41: # %bb.0: +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: movzwl (%eax), %eax +; X32-SSE41-NEXT: movd %eax, %xmm0 +; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm0 +; X32-SSE41-NEXT: paddd %xmm0, %xmm0 +; X32-SSE41-NEXT: retl + %x = load <2 x i8>, <2 x i8>* %addr, align 1 + %y = sext <2 x i8> %x to <2 x i32> + %z = add <2 x i32>%y, %y + ret <2 x i32>%z +} + +define <4 x i32> @sext_4i17_to_4i32(<4 x i17>* %ptr) { +; SSE2-LABEL: sext_4i17_to_4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq (%rdi), %rax +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shlq $30, %rcx +; SSE2-NEXT: sarq $47, %rcx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shlq $47, %rcx +; SSE2-NEXT: sarq $47, %rcx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movl 8(%rdi), %ecx +; SSE2-NEXT: shll $13, %ecx +; SSE2-NEXT: movq %rax, %rdx +; SSE2-NEXT: shrq $51, %rdx +; SSE2-NEXT: orl %ecx, %edx +; SSE2-NEXT: shlq $47, %rdx +; SSE2-NEXT: sarq $47, %rdx +; SSE2-NEXT: movd %edx, %xmm1 +; SSE2-NEXT: shlq $13, %rax +; SSE2-NEXT: sarq $47, %rax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_4i17_to_4i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movq (%rdi), %rax +; SSSE3-NEXT: movq %rax, %rcx +; SSSE3-NEXT: shlq $30, %rcx +; SSSE3-NEXT: sarq $47, %rcx +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: movq %rax, %rcx +; SSSE3-NEXT: shlq $47, %rcx +; SSSE3-NEXT: sarq $47, %rcx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: movl 8(%rdi), %ecx +; SSSE3-NEXT: shll $13, %ecx +; SSSE3-NEXT: movq %rax, %rdx +; SSSE3-NEXT: shrq $51, %rdx +; SSSE3-NEXT: orl %ecx, %edx +; SSSE3-NEXT: shlq $47, %rdx +; SSSE3-NEXT: sarq $47, %rdx +; SSSE3-NEXT: movd %edx, %xmm1 +; SSSE3-NEXT: shlq $13, %rax +; SSSE3-NEXT: sarq $47, %rax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_4i17_to_4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movq (%rdi), %rax +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $30, %rcx +; SSE41-NEXT: sarq $47, %rcx +; SSE41-NEXT: movq %rax, %rdx +; SSE41-NEXT: shlq $47, %rdx +; SSE41-NEXT: sarq $47, %rdx +; SSE41-NEXT: movd %edx, %xmm0 +; SSE41-NEXT: pinsrd $1, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $13, %rcx +; SSE41-NEXT: sarq $47, %rcx +; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 +; SSE41-NEXT: movl 8(%rdi), %ecx +; SSE41-NEXT: shll $13, %ecx +; SSE41-NEXT: shrq $51, %rax +; SSE41-NEXT: orl %ecx, %eax +; SSE41-NEXT: shlq $47, %rax +; SSE41-NEXT: sarq $47, %rax +; SSE41-NEXT: pinsrd $3, %eax, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sext_4i17_to_4i32: +; AVX: # %bb.0: +; AVX-NEXT: movq (%rdi), %rax +; AVX-NEXT: movq %rax, %rcx +; AVX-NEXT: shlq $30, %rcx +; AVX-NEXT: sarq $47, %rcx +; AVX-NEXT: movq %rax, %rdx +; AVX-NEXT: shlq $47, %rdx +; AVX-NEXT: sarq $47, %rdx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX-NEXT: movq %rax, %rcx +; AVX-NEXT: shlq $13, %rcx +; AVX-NEXT: sarq $47, %rcx +; AVX-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX-NEXT: movl 8(%rdi), %ecx +; AVX-NEXT: shll $13, %ecx +; AVX-NEXT: shrq $51, %rax +; AVX-NEXT: orl %ecx, %eax +; AVX-NEXT: shlq $47, %rax +; AVX-NEXT: sarq $47, %rax +; AVX-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X32-SSE2-LABEL: sext_4i17_to_4i32: +; X32-SSE2: # %bb.0: +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movl (%eax), %ecx +; X32-SSE2-NEXT: movl 4(%eax), %edx +; X32-SSE2-NEXT: movl 8(%eax), %eax +; X32-SSE2-NEXT: shldl $13, %edx, %eax +; X32-SSE2-NEXT: shll $15, %eax +; X32-SSE2-NEXT: movd %eax, %xmm0 +; X32-SSE2-NEXT: movl %edx, %eax +; X32-SSE2-NEXT: shll $13, %eax +; X32-SSE2-NEXT: movd %eax, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: shldl $15, %ecx, %edx +; X32-SSE2-NEXT: shll $15, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: shll $15, %edx +; X32-SSE2-NEXT: movd %edx, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-SSE2-NEXT: psrad $15, %xmm0 +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: sext_4i17_to_4i32: +; X32-SSE41: # %bb.0: +; X32-SSE41-NEXT: pushl %esi +; X32-SSE41-NEXT: .cfi_def_cfa_offset 8 +; X32-SSE41-NEXT: .cfi_offset %esi, -8 +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: movl (%eax), %ecx +; X32-SSE41-NEXT: movl 4(%eax), %edx +; X32-SSE41-NEXT: movl %edx, %esi +; X32-SSE41-NEXT: movl 8(%eax), %eax +; X32-SSE41-NEXT: shldl $13, %edx, %eax +; X32-SSE41-NEXT: shldl $15, %ecx, %edx +; X32-SSE41-NEXT: shll $15, %edx +; X32-SSE41-NEXT: shll $15, %ecx +; X32-SSE41-NEXT: movd %ecx, %xmm0 +; X32-SSE41-NEXT: pinsrd $1, %edx, %xmm0 +; X32-SSE41-NEXT: shll $13, %esi +; X32-SSE41-NEXT: pinsrd $2, %esi, %xmm0 +; X32-SSE41-NEXT: shll $15, %eax +; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0 +; X32-SSE41-NEXT: psrad $15, %xmm0 +; X32-SSE41-NEXT: popl %esi +; X32-SSE41-NEXT: .cfi_def_cfa_offset 4 +; X32-SSE41-NEXT: retl + %a = load <4 x i17>, <4 x i17>* %ptr + %b = sext <4 x i17> %a to <4 x i32> + ret <4 x i32> %b +} + +define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_8i6_to_8i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movd %edi, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSE2-NEXT: paddw {{.*}}(%rip), %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] +; SSE2-NEXT: psllq $58, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE2-NEXT: psrad $26, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7] +; SSE2-NEXT: psllq $58, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE2-NEXT: psrad $26, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7] +; SSE2-NEXT: psllq $58, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] +; SSE2-NEXT: psrad $26, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,3,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7] +; SSE2-NEXT: psllq $58, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] +; SSE2-NEXT: psrad $26, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_8i6_to_8i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movd %edi, %xmm0 +; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSSE3-NEXT: paddw {{.*}}(%rip), %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] +; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] +; SSSE3-NEXT: psllq $58, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSSE3-NEXT: psrad $26, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,3] +; SSSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7] +; SSSE3-NEXT: psllq $58, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSSE3-NEXT: psrad $26, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3] +; SSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7] +; SSSE3-NEXT: psllq $58, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] +; SSSE3-NEXT: psrad $26, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,3,3] +; SSSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7] +; SSSE3-NEXT: psllq $58, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] +; SSSE3-NEXT: psrad $26, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_8i6_to_8i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSE41-NEXT: paddw {{.*}}(%rip), %xmm3 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; SSE41-NEXT: psllq $58, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: psrad $26, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,2,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psllq $58, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: psrad $26, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; SSE41-NEXT: psllq $58, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: psrad $26, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; SSE41-NEXT: psllq $58, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: psrad $26, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_8i6_to_8i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovd %edi, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $10, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $10, %xmm0, %xmm1 +; AVX1-NEXT: vpmovsxwq %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] +; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1] +; AVX1-NEXT: vpmovsxwq %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_8i6_to_8i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $10, %xmm0, %xmm0 +; AVX2-NEXT: vpsraw $10, %xmm0, %xmm1 +; AVX2-NEXT: vpmovsxwq %xmm1, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_8i6_to_8i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovd %edi, %xmm0 +; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vpsllq $58, %zmm0, %zmm0 +; AVX512-NEXT: vpsraq $58, %zmm0, %zmm0 +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: sext_8i6_to_8i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; X32-SSE2-NEXT: paddw {{\.LCPI.*}}, %xmm3 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] +; X32-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] +; X32-SSE2-NEXT: psllq $58, %xmm0 +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; X32-SSE2-NEXT: psrad $26, %xmm0 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,3] +; X32-SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7] +; X32-SSE2-NEXT: psllq $58, %xmm1 +; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X32-SSE2-NEXT: psrad $26, %xmm1 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3] +; X32-SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7] +; X32-SSE2-NEXT: psllq $58, %xmm2 +; X32-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X32-SSE2-NEXT: psrad $31, %xmm4 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] +; X32-SSE2-NEXT: psrad $26, %xmm2 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,3,3] +; X32-SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7] +; X32-SSE2-NEXT: psllq $58, %xmm3 +; X32-SSE2-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE2-NEXT: psrad $31, %xmm4 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] +; X32-SSE2-NEXT: psrad $26, %xmm3 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: sext_8i6_to_8i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; X32-SSE41-NEXT: paddw {{\.LCPI.*}}, %xmm3 +; X32-SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; X32-SSE41-NEXT: psllq $58, %xmm0 +; X32-SSE41-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE41-NEXT: psrad $31, %xmm1 +; X32-SSE41-NEXT: psrad $26, %xmm0 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X32-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,2,3] +; X32-SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X32-SSE41-NEXT: psllq $58, %xmm1 +; X32-SSE41-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE41-NEXT: psrad $31, %xmm2 +; X32-SSE41-NEXT: psrad $26, %xmm1 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X32-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; X32-SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; X32-SSE41-NEXT: psllq $58, %xmm2 +; X32-SSE41-NEXT: movdqa %xmm2, %xmm4 +; X32-SSE41-NEXT: psrad $31, %xmm4 +; X32-SSE41-NEXT: psrad $26, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X32-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; X32-SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; X32-SSE41-NEXT: psllq $58, %xmm3 +; X32-SSE41-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE41-NEXT: psrad $31, %xmm4 +; X32-SSE41-NEXT: psrad $26, %xmm3 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; X32-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; X32-SSE41-NEXT: retl +entry: + %a = trunc i32 %x to i6 + %b = insertelement <8 x i6> undef, i6 %a, i32 0 + %c = shufflevector <8 x i6> %b, <8 x i6> undef, <8 x i32> zeroinitializer + %d = add <8 x i6> %c, <i6 0, i6 1, i6 2, i6 3, i6 4, i6 5, i6 6, i6 7> + %e = sext <8 x i6> %d to <8 x i64> + ret <8 x i64> %e +} + +define <8 x i32> @zext_negate_sext(<8 x i8> %x) { +; SSE2-LABEL: zext_negate_sext: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: psubw %xmm0, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_negate_sext: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: psubw %xmm0, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_negate_sext: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: psubw %xmm0, %xmm1 +; SSE41-NEXT: pmovsxwd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: pmovsxwd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_negate_sext: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_negate_sext: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_negate_sext: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpsubd %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: zext_negate_sext: +; X32-SSE2: # %bb.0: +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X32-SSE2-NEXT: psubw %xmm0, %xmm1 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X32-SSE2-NEXT: psrad $16, %xmm0 +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: psrad $16, %xmm1 +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: zext_negate_sext: +; X32-SSE41: # %bb.0: +; X32-SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X32-SSE41-NEXT: pxor %xmm1, %xmm1 +; X32-SSE41-NEXT: psubw %xmm0, %xmm1 +; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm0 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm1 +; X32-SSE41-NEXT: retl + %z = zext <8 x i8> %x to <8 x i16> + %neg = sub nsw <8 x i16> zeroinitializer, %z + %r = sext <8 x i16> %neg to <8 x i32> + ret <8 x i32> %r +} + +define <8 x i32> @zext_decremenet_sext(<8 x i8> %x) { +; SSE2-LABEL: zext_decremenet_sext: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_decremenet_sext: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT: paddw %xmm0, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_decremenet_sext: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: paddw %xmm0, %xmm1 +; SSE41-NEXT: pmovsxwd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: pmovsxwd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_decremenet_sext: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_decremenet_sext: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_decremenet_sext: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE2-LABEL: zext_decremenet_sext: +; X32-SSE2: # %bb.0: +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X32-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; X32-SSE2-NEXT: paddw %xmm0, %xmm1 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X32-SSE2-NEXT: psrad $16, %xmm0 +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: psrad $16, %xmm1 +; X32-SSE2-NEXT: retl +; +; X32-SSE41-LABEL: zext_decremenet_sext: +; X32-SSE41: # %bb.0: +; X32-SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X32-SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; X32-SSE41-NEXT: paddw %xmm0, %xmm1 +; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm0 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm1 +; X32-SSE41-NEXT: retl + %z = zext <8 x i8> %x to <8 x i16> + %dec = add <8 x i16> %z, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> + %r = sext <8 x i16> %dec to <8 x i32> + ret <8 x i32> %r +} diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll new file mode 100644 index 00000000000..cd7902ea98d --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll @@ -0,0 +1,2481 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL +; +; Just one 32-bit run to make sure we do reasonable things for i64 shifts. +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2 + +; +; Variable Shifts +; + +define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { +; SSE2-LABEL: var_shift_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psrad %xmm2, %xmm3 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrad %xmm4, %xmm2 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psrad %xmm3, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] +; SSE2-NEXT: psrad %xmm1, %xmm0 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrad %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: psrad %xmm4, %xmm5 +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrad %xmm1, %xmm3 +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] +; SSE41-NEXT: psrad %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: var_shift_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: var_shift_v2i32: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: var_shift_v2i32: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq +; +; AVX512-LABEL: var_shift_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: var_shift_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: var_shift_v2i32: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] +; X32-SSE-NEXT: movdqa %xmm0, %xmm3 +; X32-SSE-NEXT: psrad %xmm2, %xmm3 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: psrad %xmm4, %xmm2 +; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] +; X32-SSE-NEXT: movdqa %xmm0, %xmm4 +; X32-SSE-NEXT: psrad %xmm3, %xmm4 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] +; X32-SSE-NEXT: psrad %xmm1, %xmm0 +; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] +; X32-SSE-NEXT: movaps %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %shift = ashr <2 x i32> %a, %b + ret <2 x i32> %shift +} + +define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { +; SSE2-LABEL: var_shift_v4i16: +; SSE2: # %bb.0: +; SSE2-NEXT: psllw $12, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psraw $4, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psraw $2, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: psraw $15, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: psraw $1, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v4i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: psllw $12, %xmm0 +; SSE41-NEXT: psllw $4, %xmm2 +; SSE41-NEXT: por %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: paddw %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: psraw $8, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $4, %xmm2 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $2, %xmm2 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $1, %xmm2 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: var_shift_v4i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 +; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v4i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; XOP-LABEL: var_shift_v4i16: +; XOP: # %bb.0: +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: var_shift_v4i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512DQ-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v4i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v4i16: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512DQVL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v4i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsravw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: var_shift_v4i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psllw $12, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: psraw $15, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pandn %xmm0, %xmm3 +; X32-SSE-NEXT: psraw $8, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm3, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: psraw $15, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pandn %xmm0, %xmm3 +; X32-SSE-NEXT: psraw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm3, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: psraw $15, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pandn %xmm0, %xmm3 +; X32-SSE-NEXT: psraw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm3, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: psraw $15, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: pandn %xmm0, %xmm2 +; X32-SSE-NEXT: psraw $1, %xmm0 +; X32-SSE-NEXT: pand %xmm1, %xmm0 +; X32-SSE-NEXT: por %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %shift = ashr <4 x i16> %a, %b + ret <4 x i16> %shift +} + +define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { +; SSE2-LABEL: var_shift_v2i16: +; SSE2: # %bb.0: +; SSE2-NEXT: psllw $12, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psraw $4, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psraw $2, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: psraw $15, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: psraw $1, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v2i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: psllw $12, %xmm0 +; SSE41-NEXT: psllw $4, %xmm2 +; SSE41-NEXT: por %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: paddw %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: psraw $8, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $4, %xmm2 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $2, %xmm2 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $1, %xmm2 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: var_shift_v2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 +; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; XOP-LABEL: var_shift_v2i16: +; XOP: # %bb.0: +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: var_shift_v2i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512DQ-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v2i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v2i16: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512DQVL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v2i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsravw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: var_shift_v2i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psllw $12, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: psraw $15, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pandn %xmm0, %xmm3 +; X32-SSE-NEXT: psraw $8, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm3, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: psraw $15, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pandn %xmm0, %xmm3 +; X32-SSE-NEXT: psraw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm3, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: psraw $15, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pandn %xmm0, %xmm3 +; X32-SSE-NEXT: psraw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm3, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: psraw $15, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: pandn %xmm0, %xmm2 +; X32-SSE-NEXT: psraw $1, %xmm0 +; X32-SSE-NEXT: pand %xmm1, %xmm0 +; X32-SSE-NEXT: por %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %shift = ashr <2 x i16> %a, %b + ret <2 x i16> %shift +} + +define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { +; SSE2-LABEL: var_shift_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE2-NEXT: psllw $5, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: psraw $4, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: psraw $2, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: psraw $1, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: psraw $4, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: psraw $2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: psraw $1, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $4, %xmm4 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $2, %xmm4 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $1, %xmm4 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $4, %xmm2 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $2, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $1, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: packuswb %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: var_shift_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 +; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 +; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 +; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 +; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: var_shift_v8i8: +; XOP: # %bb.0: +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: var_shift_v8i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v8i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: var_shift_v8i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; X32-SSE-NEXT: psllw $5, %xmm1 +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pxor %xmm5, %xmm5 +; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 +; X32-SSE-NEXT: movdqa %xmm5, %xmm6 +; X32-SSE-NEXT: pandn %xmm2, %xmm6 +; X32-SSE-NEXT: psraw $4, %xmm2 +; X32-SSE-NEXT: pand %xmm5, %xmm2 +; X32-SSE-NEXT: por %xmm6, %xmm2 +; X32-SSE-NEXT: paddw %xmm4, %xmm4 +; X32-SSE-NEXT: pxor %xmm5, %xmm5 +; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 +; X32-SSE-NEXT: movdqa %xmm5, %xmm6 +; X32-SSE-NEXT: pandn %xmm2, %xmm6 +; X32-SSE-NEXT: psraw $2, %xmm2 +; X32-SSE-NEXT: pand %xmm5, %xmm2 +; X32-SSE-NEXT: por %xmm6, %xmm2 +; X32-SSE-NEXT: paddw %xmm4, %xmm4 +; X32-SSE-NEXT: pxor %xmm5, %xmm5 +; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 +; X32-SSE-NEXT: movdqa %xmm5, %xmm4 +; X32-SSE-NEXT: pandn %xmm2, %xmm4 +; X32-SSE-NEXT: psraw $1, %xmm2 +; X32-SSE-NEXT: pand %xmm5, %xmm2 +; X32-SSE-NEXT: por %xmm4, %xmm2 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pxor %xmm4, %xmm4 +; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4 +; X32-SSE-NEXT: movdqa %xmm4, %xmm5 +; X32-SSE-NEXT: pandn %xmm0, %xmm5 +; X32-SSE-NEXT: psraw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm4, %xmm0 +; X32-SSE-NEXT: por %xmm5, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: pxor %xmm4, %xmm4 +; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4 +; X32-SSE-NEXT: movdqa %xmm4, %xmm5 +; X32-SSE-NEXT: pandn %xmm0, %xmm5 +; X32-SSE-NEXT: psraw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm4, %xmm0 +; X32-SSE-NEXT: por %xmm5, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm1 +; X32-SSE-NEXT: pandn %xmm0, %xmm1 +; X32-SSE-NEXT: psraw $1, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: por %xmm1, %xmm0 +; X32-SSE-NEXT: psrlw $8, %xmm0 +; X32-SSE-NEXT: packuswb %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %shift = ashr <8 x i8> %a, %b + ret <8 x i8> %shift +} + +define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { +; SSE2-LABEL: var_shift_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE2-NEXT: psllw $5, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: psraw $4, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: psraw $2, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: psraw $1, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: psraw $4, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: psraw $2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: psraw $1, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $4, %xmm4 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $2, %xmm4 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $1, %xmm4 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $4, %xmm2 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $2, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $1, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: packuswb %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: var_shift_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 +; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 +; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 +; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 +; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: var_shift_v4i8: +; XOP: # %bb.0: +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: var_shift_v4i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v4i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v4i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: var_shift_v4i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; X32-SSE-NEXT: psllw $5, %xmm1 +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pxor %xmm5, %xmm5 +; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 +; X32-SSE-NEXT: movdqa %xmm5, %xmm6 +; X32-SSE-NEXT: pandn %xmm2, %xmm6 +; X32-SSE-NEXT: psraw $4, %xmm2 +; X32-SSE-NEXT: pand %xmm5, %xmm2 +; X32-SSE-NEXT: por %xmm6, %xmm2 +; X32-SSE-NEXT: paddw %xmm4, %xmm4 +; X32-SSE-NEXT: pxor %xmm5, %xmm5 +; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 +; X32-SSE-NEXT: movdqa %xmm5, %xmm6 +; X32-SSE-NEXT: pandn %xmm2, %xmm6 +; X32-SSE-NEXT: psraw $2, %xmm2 +; X32-SSE-NEXT: pand %xmm5, %xmm2 +; X32-SSE-NEXT: por %xmm6, %xmm2 +; X32-SSE-NEXT: paddw %xmm4, %xmm4 +; X32-SSE-NEXT: pxor %xmm5, %xmm5 +; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 +; X32-SSE-NEXT: movdqa %xmm5, %xmm4 +; X32-SSE-NEXT: pandn %xmm2, %xmm4 +; X32-SSE-NEXT: psraw $1, %xmm2 +; X32-SSE-NEXT: pand %xmm5, %xmm2 +; X32-SSE-NEXT: por %xmm4, %xmm2 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pxor %xmm4, %xmm4 +; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4 +; X32-SSE-NEXT: movdqa %xmm4, %xmm5 +; X32-SSE-NEXT: pandn %xmm0, %xmm5 +; X32-SSE-NEXT: psraw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm4, %xmm0 +; X32-SSE-NEXT: por %xmm5, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: pxor %xmm4, %xmm4 +; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4 +; X32-SSE-NEXT: movdqa %xmm4, %xmm5 +; X32-SSE-NEXT: pandn %xmm0, %xmm5 +; X32-SSE-NEXT: psraw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm4, %xmm0 +; X32-SSE-NEXT: por %xmm5, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm1 +; X32-SSE-NEXT: pandn %xmm0, %xmm1 +; X32-SSE-NEXT: psraw $1, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: por %xmm1, %xmm0 +; X32-SSE-NEXT: psrlw $8, %xmm0 +; X32-SSE-NEXT: packuswb %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %shift = ashr <4 x i8> %a, %b + ret <4 x i8> %shift +} + +define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { +; SSE2-LABEL: var_shift_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE2-NEXT: psllw $5, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: psraw $4, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: psraw $2, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: psraw $1, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: psraw $4, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: psraw $2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: psraw $1, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v2i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $4, %xmm4 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $2, %xmm4 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $1, %xmm4 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $4, %xmm2 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $2, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $1, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: packuswb %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: var_shift_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 +; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 +; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 +; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 +; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: var_shift_v2i8: +; XOP: # %bb.0: +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: var_shift_v2i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v2i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v2i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v2i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: var_shift_v2i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; X32-SSE-NEXT: psllw $5, %xmm1 +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pxor %xmm5, %xmm5 +; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 +; X32-SSE-NEXT: movdqa %xmm5, %xmm6 +; X32-SSE-NEXT: pandn %xmm2, %xmm6 +; X32-SSE-NEXT: psraw $4, %xmm2 +; X32-SSE-NEXT: pand %xmm5, %xmm2 +; X32-SSE-NEXT: por %xmm6, %xmm2 +; X32-SSE-NEXT: paddw %xmm4, %xmm4 +; X32-SSE-NEXT: pxor %xmm5, %xmm5 +; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 +; X32-SSE-NEXT: movdqa %xmm5, %xmm6 +; X32-SSE-NEXT: pandn %xmm2, %xmm6 +; X32-SSE-NEXT: psraw $2, %xmm2 +; X32-SSE-NEXT: pand %xmm5, %xmm2 +; X32-SSE-NEXT: por %xmm6, %xmm2 +; X32-SSE-NEXT: paddw %xmm4, %xmm4 +; X32-SSE-NEXT: pxor %xmm5, %xmm5 +; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 +; X32-SSE-NEXT: movdqa %xmm5, %xmm4 +; X32-SSE-NEXT: pandn %xmm2, %xmm4 +; X32-SSE-NEXT: psraw $1, %xmm2 +; X32-SSE-NEXT: pand %xmm5, %xmm2 +; X32-SSE-NEXT: por %xmm4, %xmm2 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pxor %xmm4, %xmm4 +; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4 +; X32-SSE-NEXT: movdqa %xmm4, %xmm5 +; X32-SSE-NEXT: pandn %xmm0, %xmm5 +; X32-SSE-NEXT: psraw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm4, %xmm0 +; X32-SSE-NEXT: por %xmm5, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: pxor %xmm4, %xmm4 +; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4 +; X32-SSE-NEXT: movdqa %xmm4, %xmm5 +; X32-SSE-NEXT: pandn %xmm0, %xmm5 +; X32-SSE-NEXT: psraw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm4, %xmm0 +; X32-SSE-NEXT: por %xmm5, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm1 +; X32-SSE-NEXT: pandn %xmm0, %xmm1 +; X32-SSE-NEXT: psraw $1, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: por %xmm1, %xmm0 +; X32-SSE-NEXT: psrlw $8, %xmm0 +; X32-SSE-NEXT: packuswb %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %shift = ashr <2 x i8> %a, %b + ret <2 x i8> %shift +} + +; +; Uniform Variable Shifts +; + +define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { +; SSE2-LABEL: splatvar_shift_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE2-NEXT: psrad %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: psrad %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: splatvar_shift_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatvar_shift_v2i32: +; XOP: # %bb.0: +; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; XOP-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatvar_shift_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_shift_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512VL-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatvar_shift_v2i32: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: xorps %xmm2, %xmm2 +; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; X32-SSE-NEXT: psrad %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer + %shift = ashr <2 x i32> %a, %splat + ret <2 x i32> %shift +} + +define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { +; SSE2-LABEL: splatvar_shift_v4i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psraw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v4i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psraw %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: splatvar_shift_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatvar_shift_v4i16: +; XOP: # %bb.0: +; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; XOP-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatvar_shift_v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_shift_v4i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatvar_shift_v4i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psraw %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer + %shift = ashr <4 x i16> %a, %splat + ret <4 x i16> %shift +} + +define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { +; SSE2-LABEL: splatvar_shift_v2i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psraw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v2i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psraw %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: splatvar_shift_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatvar_shift_v2i16: +; XOP: # %bb.0: +; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; XOP-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatvar_shift_v2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_shift_v2i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatvar_shift_v2i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psraw %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer + %shift = ashr <2 x i16> %a, %splat + ret <2 x i16> %shift +} + +define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { +; SSE2-LABEL: splatvar_shift_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlw %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubb %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psrlw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: psubb %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatvar_shift_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_shift_v8i8: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_shift_v8i8: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq +; +; AVX512DQ-LABEL: splatvar_shift_v8i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: splatvar_shift_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: splatvar_shift_v8i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_shift_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: splatvar_shift_v8i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psrlw %xmm1, %xmm0 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; X32-SSE-NEXT: psrlw %xmm1, %xmm2 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; X32-SSE-NEXT: psrlw %xmm1, %xmm2 +; X32-SSE-NEXT: pxor %xmm2, %xmm0 +; X32-SSE-NEXT: psubb %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer + %shift = ashr <8 x i8> %a, %splat + ret <8 x i8> %shift +} + +define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { +; SSE2-LABEL: splatvar_shift_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlw %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubb %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psrlw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: psubb %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatvar_shift_v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_shift_v4i8: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_shift_v4i8: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq +; +; AVX512DQ-LABEL: splatvar_shift_v4i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: splatvar_shift_v4i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: splatvar_shift_v4i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_shift_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: splatvar_shift_v4i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psrlw %xmm1, %xmm0 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; X32-SSE-NEXT: psrlw %xmm1, %xmm2 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; X32-SSE-NEXT: psrlw %xmm1, %xmm2 +; X32-SSE-NEXT: pxor %xmm2, %xmm0 +; X32-SSE-NEXT: psubb %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer + %shift = ashr <4 x i8> %a, %splat + ret <4 x i8> %shift +} + +define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { +; SSE2-LABEL: splatvar_shift_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlw %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubb %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v2i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psrlw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: psubb %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatvar_shift_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOP-LABEL: splatvar_shift_v2i8: +; XOP: # %bb.0: +; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u] +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: splatvar_shift_v2i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: splatvar_shift_v2i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: splatvar_shift_v2i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_shift_v2i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: splatvar_shift_v2i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psrlw %xmm1, %xmm0 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; X32-SSE-NEXT: psrlw %xmm1, %xmm2 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; X32-SSE-NEXT: psrlw %xmm1, %xmm2 +; X32-SSE-NEXT: pxor %xmm2, %xmm0 +; X32-SSE-NEXT: psubb %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer + %shift = ashr <2 x i8> %a, %splat + ret <2 x i8> %shift +} + +; +; Constant Shifts +; + +define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind { +; SSE2-LABEL: constant_shift_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $4, %xmm1 +; SSE2-NEXT: psrad $5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrad $5, %xmm1 +; SSE41-NEXT: psrad $4, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: constant_shift_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrad $5, %xmm0, %xmm1 +; AVX1-NEXT: vpsrad $4, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: constant_shift_v2i32: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpshad {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: constant_shift_v2i32: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX2-NEXT: retq +; +; AVX512-LABEL: constant_shift_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: constant_shift_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: constant_shift_v2i32: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE-NEXT: psrad $4, %xmm1 +; X32-SSE-NEXT: psrad $5, %xmm0 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %shift = ashr <2 x i32> %a, <i32 4, i32 5> + ret <2 x i32> %shift +} + +define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { +; SSE2-LABEL: constant_shift_v4i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psraw $2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] +; SSE2-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: andps %xmm2, %xmm0 +; SSE2-NEXT: psraw $1, %xmm1 +; SSE2-NEXT: andnps %xmm1, %xmm2 +; SSE2-NEXT: orps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v4i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <u,32768,16384,8192,u,u,u,u> +; SSE41-NEXT: pmulhw %xmm0, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE41-NEXT: psraw $1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; SSE41-NEXT: retq +; +; AVX-LABEL: constant_shift_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: vpsraw $1, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX-NEXT: retq +; +; XOP-LABEL: constant_shift_v4i16: +; XOP: # %bb.0: +; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: constant_shift_v4i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v4i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u> +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v4i16: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v4i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: constant_shift_v4i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE-NEXT: psraw $2, %xmm1 +; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] +; X32-SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535] +; X32-SSE-NEXT: movaps %xmm1, %xmm0 +; X32-SSE-NEXT: andps %xmm2, %xmm0 +; X32-SSE-NEXT: psraw $1, %xmm1 +; X32-SSE-NEXT: andnps %xmm1, %xmm2 +; X32-SSE-NEXT: orps %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %shift = ashr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3> + ret <4 x i16> %shift +} + +define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind { +; SSE2-LABEL: constant_shift_v2i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psraw $3, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: psraw $2, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v2i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psraw $3, %xmm1 +; SSE41-NEXT: psraw $2, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; SSE41-NEXT: retq +; +; AVX-LABEL: constant_shift_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsraw $3, %xmm0, %xmm1 +; AVX-NEXT: vpsraw $2, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX-NEXT: retq +; +; XOP-LABEL: constant_shift_v2i16: +; XOP: # %bb.0: +; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: constant_shift_v2i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpsraw $3, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpsraw $2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v2i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u> +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v2i16: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpsraw $3, %xmm0, %xmm1 +; AVX512DQVL-NEXT: vpsraw $2, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v2i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: constant_shift_v2i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE-NEXT: psraw $3, %xmm1 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] +; X32-SSE-NEXT: psraw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: pandn %xmm1, %xmm2 +; X32-SSE-NEXT: por %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %shift = ashr <2 x i16> %a, <i16 2, i16 3> + ret <2 x i16> %shift +} + +define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { +; SSE-LABEL: constant_shift_v8i8: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: psraw $8, %xmm0 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: constant_shift_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; XOP-LABEL: constant_shift_v8i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: constant_shift_v8i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v8i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: constant_shift_v8i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: psraw $8, %xmm0 +; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: psrlw $8, %xmm0 +; X32-SSE-NEXT: packuswb %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %shift = ashr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7> + ret <8 x i8> %shift +} + +define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { +; SSE-LABEL: constant_shift_v4i8: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: psraw $8, %xmm0 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: constant_shift_v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; XOP-LABEL: constant_shift_v4i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: constant_shift_v4i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v4i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v4i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: constant_shift_v4i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: psraw $8, %xmm0 +; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: psrlw $8, %xmm0 +; X32-SSE-NEXT: packuswb %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %shift = ashr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3> + ret <4 x i8> %shift +} + +define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { +; SSE-LABEL: constant_shift_v2i8: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: psraw $8, %xmm0 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: constant_shift_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; XOP-LABEL: constant_shift_v2i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: constant_shift_v2i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v2i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v2i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v2i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: constant_shift_v2i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: psraw $8, %xmm0 +; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: psrlw $8, %xmm0 +; X32-SSE-NEXT: packuswb %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %shift = ashr <2 x i8> %a, <i8 2, i8 3> + ret <2 x i8> %shift +} + +; +; Uniform Constant Shifts +; + +define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind { +; SSE-LABEL: splatconstant_shift_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: psrad $5, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpsrad $5, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatconstant_shift_v2i32: +; XOP: # %bb.0: +; XOP-NEXT: vpsrad $5, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatconstant_shift_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrad $5, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsrad $5, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatconstant_shift_v2i32: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psrad $5, %xmm0 +; X32-SSE-NEXT: retl + %shift = ashr <2 x i32> %a, <i32 5, i32 5> + ret <2 x i32> %shift +} + +define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind { +; SSE-LABEL: splatconstant_shift_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: psraw $3, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsraw $3, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatconstant_shift_v4i16: +; XOP: # %bb.0: +; XOP-NEXT: vpsraw $3, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatconstant_shift_v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsraw $3, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v4i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsraw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatconstant_shift_v4i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psraw $3, %xmm0 +; X32-SSE-NEXT: retl + %shift = ashr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3> + ret <4 x i16> %shift +} + +define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind { +; SSE-LABEL: splatconstant_shift_v2i16: +; SSE: # %bb.0: +; SSE-NEXT: psraw $3, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsraw $3, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatconstant_shift_v2i16: +; XOP: # %bb.0: +; XOP-NEXT: vpsraw $3, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatconstant_shift_v2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsraw $3, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v2i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsraw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatconstant_shift_v2i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psraw $3, %xmm0 +; X32-SSE-NEXT: retl + %shift = ashr <2 x i16> %a, <i16 3, i16 3> + ret <2 x i16> %shift +} + +define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind { +; SSE-LABEL: splatconstant_shift_v8i8: +; SSE: # %bb.0: +; SSE-NEXT: psrlw $3, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: psubb %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatconstant_shift_v8i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatconstant_shift_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v8i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatconstant_shift_v8i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psrlw $3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; X32-SSE-NEXT: pxor %xmm1, %xmm0 +; X32-SSE-NEXT: psubb %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %shift = ashr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> + ret <8 x i8> %shift +} + +define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind { +; SSE-LABEL: splatconstant_shift_v4i8: +; SSE: # %bb.0: +; SSE-NEXT: psrlw $3, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: psubb %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatconstant_shift_v4i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatconstant_shift_v4i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v4i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatconstant_shift_v4i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psrlw $3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; X32-SSE-NEXT: pxor %xmm1, %xmm0 +; X32-SSE-NEXT: psubb %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %shift = ashr <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3> + ret <4 x i8> %shift +} + +define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind { +; SSE-LABEL: splatconstant_shift_v2i8: +; SSE: # %bb.0: +; SSE-NEXT: psrlw $3, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: psubb %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatconstant_shift_v2i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatconstant_shift_v2i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v2i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatconstant_shift_v2i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psrlw $3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; X32-SSE-NEXT: pxor %xmm1, %xmm0 +; X32-SSE-NEXT: psubb %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %shift = ashr <2 x i8> %a, <i8 3, i8 3> + ret <2 x i8> %shift +} diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128-widen.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128-widen.ll new file mode 100644 index 00000000000..79f8cc6a05f --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128-widen.ll @@ -0,0 +1,2151 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL +; +; Just one 32-bit run to make sure we do reasonable things for i64 shifts. +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2 + +; +; Variable Shifts +; + +define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { +; SSE2-LABEL: var_shift_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psrld %xmm2, %xmm3 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrld %xmm4, %xmm2 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psrld %xmm3, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] +; SSE2-NEXT: psrld %xmm1, %xmm0 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrld %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: psrld %xmm4, %xmm5 +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrld %xmm1, %xmm3 +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] +; SSE41-NEXT: psrld %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: var_shift_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: var_shift_v2i32: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: var_shift_v2i32: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq +; +; AVX512-LABEL: var_shift_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: var_shift_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: var_shift_v2i32: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] +; X32-SSE-NEXT: movdqa %xmm0, %xmm3 +; X32-SSE-NEXT: psrld %xmm2, %xmm3 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: psrld %xmm4, %xmm2 +; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] +; X32-SSE-NEXT: movdqa %xmm0, %xmm4 +; X32-SSE-NEXT: psrld %xmm3, %xmm4 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] +; X32-SSE-NEXT: psrld %xmm1, %xmm0 +; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] +; X32-SSE-NEXT: movaps %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %shift = lshr <2 x i32> %a, %b + ret <2 x i32> %shift +} + +define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { +; SSE2-LABEL: var_shift_v4i16: +; SSE2: # %bb.0: +; SSE2-NEXT: psllw $12, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: psraw $15, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v4i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: psllw $12, %xmm0 +; SSE41-NEXT: psllw $4, %xmm2 +; SSE41-NEXT: por %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: paddw %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: psrlw $8, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlw $4, %xmm2 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlw $2, %xmm2 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlw $1, %xmm2 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: var_shift_v4i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 +; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v4i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; XOP-LABEL: var_shift_v4i16: +; XOP: # %bb.0: +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: var_shift_v4i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v4i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v4i16: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v4i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: var_shift_v4i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psllw $12, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: psraw $15, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pandn %xmm0, %xmm3 +; X32-SSE-NEXT: psrlw $8, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm3, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: psraw $15, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pandn %xmm0, %xmm3 +; X32-SSE-NEXT: psrlw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm3, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: psraw $15, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pandn %xmm0, %xmm3 +; X32-SSE-NEXT: psrlw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm3, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: psraw $15, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: pandn %xmm0, %xmm2 +; X32-SSE-NEXT: psrlw $1, %xmm0 +; X32-SSE-NEXT: pand %xmm1, %xmm0 +; X32-SSE-NEXT: por %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %shift = lshr <4 x i16> %a, %b + ret <4 x i16> %shift +} + +define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { +; SSE2-LABEL: var_shift_v2i16: +; SSE2: # %bb.0: +; SSE2-NEXT: psllw $12, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: psraw $15, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v2i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: psllw $12, %xmm0 +; SSE41-NEXT: psllw $4, %xmm2 +; SSE41-NEXT: por %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: paddw %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: psrlw $8, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlw $4, %xmm2 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlw $2, %xmm2 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlw $1, %xmm2 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: var_shift_v2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 +; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; XOP-LABEL: var_shift_v2i16: +; XOP: # %bb.0: +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: var_shift_v2i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v2i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v2i16: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v2i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: var_shift_v2i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psllw $12, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: psraw $15, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pandn %xmm0, %xmm3 +; X32-SSE-NEXT: psrlw $8, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm3, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: psraw $15, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pandn %xmm0, %xmm3 +; X32-SSE-NEXT: psrlw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm3, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: psraw $15, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pandn %xmm0, %xmm3 +; X32-SSE-NEXT: psrlw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm3, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm1 +; X32-SSE-NEXT: psraw $15, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: pandn %xmm0, %xmm2 +; X32-SSE-NEXT: psrlw $1, %xmm0 +; X32-SSE-NEXT: pand %xmm1, %xmm0 +; X32-SSE-NEXT: por %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %shift = lshr <2 x i16> %a, %b + ret <2 x i16> %shift +} + +define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { +; SSE2-LABEL: var_shift_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: psllw $5, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrlw $4, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $2, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $1, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: var_shift_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: var_shift_v8i8: +; XOP: # %bb.0: +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: var_shift_v8i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v8i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: var_shift_v8i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psllw $5, %xmm1 +; X32-SSE-NEXT: pxor %xmm2, %xmm2 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pandn %xmm0, %xmm4 +; X32-SSE-NEXT: psrlw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: paddb %xmm1, %xmm1 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pandn %xmm0, %xmm4 +; X32-SSE-NEXT: psrlw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: paddb %xmm1, %xmm1 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm1 +; X32-SSE-NEXT: pandn %xmm0, %xmm1 +; X32-SSE-NEXT: psrlw $1, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %shift = lshr <8 x i8> %a, %b + ret <8 x i8> %shift +} + +define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { +; SSE2-LABEL: var_shift_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: psllw $5, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrlw $4, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $2, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $1, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: var_shift_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: var_shift_v4i8: +; XOP: # %bb.0: +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: var_shift_v4i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v4i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v4i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: var_shift_v4i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psllw $5, %xmm1 +; X32-SSE-NEXT: pxor %xmm2, %xmm2 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pandn %xmm0, %xmm4 +; X32-SSE-NEXT: psrlw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: paddb %xmm1, %xmm1 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pandn %xmm0, %xmm4 +; X32-SSE-NEXT: psrlw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: paddb %xmm1, %xmm1 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm1 +; X32-SSE-NEXT: pandn %xmm0, %xmm1 +; X32-SSE-NEXT: psrlw $1, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %shift = lshr <4 x i8> %a, %b + ret <4 x i8> %shift +} + +define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { +; SSE2-LABEL: var_shift_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: psllw $5, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v2i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrlw $4, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $2, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $1, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: var_shift_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: var_shift_v2i8: +; XOP: # %bb.0: +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: var_shift_v2i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v2i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v2i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v2i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: var_shift_v2i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psllw $5, %xmm1 +; X32-SSE-NEXT: pxor %xmm2, %xmm2 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pandn %xmm0, %xmm4 +; X32-SSE-NEXT: psrlw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: paddb %xmm1, %xmm1 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pandn %xmm0, %xmm4 +; X32-SSE-NEXT: psrlw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: paddb %xmm1, %xmm1 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm1 +; X32-SSE-NEXT: pandn %xmm0, %xmm1 +; X32-SSE-NEXT: psrlw $1, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %shift = lshr <2 x i8> %a, %b + ret <2 x i8> %shift +} + +; +; Uniform Variable Shifts +; + +define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { +; SSE2-LABEL: splatvar_shift_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE2-NEXT: psrld %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: psrld %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: splatvar_shift_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatvar_shift_v2i32: +; XOP: # %bb.0: +; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; XOP-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatvar_shift_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_shift_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512VL-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatvar_shift_v2i32: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: xorps %xmm2, %xmm2 +; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; X32-SSE-NEXT: psrld %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer + %shift = lshr <2 x i32> %a, %splat + ret <2 x i32> %shift +} + +define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { +; SSE2-LABEL: splatvar_shift_v4i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v4i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psrlw %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: splatvar_shift_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatvar_shift_v4i16: +; XOP: # %bb.0: +; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; XOP-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatvar_shift_v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_shift_v4i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatvar_shift_v4i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psrlw %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer + %shift = lshr <4 x i16> %a, %splat + ret <4 x i16> %shift +} + +define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { +; SSE2-LABEL: splatvar_shift_v2i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v2i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psrlw %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: splatvar_shift_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatvar_shift_v2i16: +; XOP: # %bb.0: +; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; XOP-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatvar_shift_v2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_shift_v2i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatvar_shift_v2i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psrlw %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer + %shift = lshr <2 x i16> %a, %splat + ret <2 x i16> %shift +} + +define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { +; SSE2-LABEL: splatvar_shift_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlw %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psrlw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatvar_shift_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_shift_v8i8: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_shift_v8i8: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq +; +; AVX512DQ-LABEL: splatvar_shift_v8i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: splatvar_shift_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: splatvar_shift_v8i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_shift_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: splatvar_shift_v8i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psrlw %xmm1, %xmm0 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; X32-SSE-NEXT: psrlw %xmm1, %xmm2 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X32-SSE-NEXT: pand %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer + %shift = lshr <8 x i8> %a, %splat + ret <8 x i8> %shift +} + +define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { +; SSE2-LABEL: splatvar_shift_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlw %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psrlw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatvar_shift_v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_shift_v4i8: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_shift_v4i8: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq +; +; AVX512DQ-LABEL: splatvar_shift_v4i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: splatvar_shift_v4i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: splatvar_shift_v4i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_shift_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: splatvar_shift_v4i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psrlw %xmm1, %xmm0 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; X32-SSE-NEXT: psrlw %xmm1, %xmm2 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X32-SSE-NEXT: pand %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer + %shift = lshr <4 x i8> %a, %splat + ret <4 x i8> %shift +} + +define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { +; SSE2-LABEL: splatvar_shift_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlw %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v2i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psrlw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatvar_shift_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOP-LABEL: splatvar_shift_v2i8: +; XOP: # %bb.0: +; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u] +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: splatvar_shift_v2i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: splatvar_shift_v2i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: splatvar_shift_v2i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_shift_v2i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: splatvar_shift_v2i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psrlw %xmm1, %xmm0 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; X32-SSE-NEXT: psrlw %xmm1, %xmm2 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X32-SSE-NEXT: pand %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer + %shift = lshr <2 x i8> %a, %splat + ret <2 x i8> %shift +} + +; +; Constant Shifts +; + +define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind { +; SSE2-LABEL: constant_shift_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $4, %xmm1 +; SSE2-NEXT: psrld $5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $5, %xmm1 +; SSE41-NEXT: psrld $4, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: constant_shift_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrld $5, %xmm0, %xmm1 +; AVX1-NEXT: vpsrld $4, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: constant_shift_v2i32: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: constant_shift_v2i32: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX2-NEXT: retq +; +; AVX512-LABEL: constant_shift_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: constant_shift_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: constant_shift_v2i32: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE-NEXT: psrld $4, %xmm1 +; X32-SSE-NEXT: psrld $5, %xmm0 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %shift = lshr <2 x i32> %a, <i32 4, i32 5> + ret <2 x i32> %shift +} + +define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { +; SSE2-LABEL: constant_shift_v4i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v4i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <u,32768,16384,8192,u,u,u,u> +; SSE41-NEXT: pmulhuw %xmm0, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE41-NEXT: retq +; +; AVX-LABEL: constant_shift_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: retq +; +; XOP-LABEL: constant_shift_v4i16: +; XOP: # %bb.0: +; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: constant_shift_v4i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v4i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u> +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v4i16: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 +; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v4i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: constant_shift_v4i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: pandn %xmm0, %xmm2 +; X32-SSE-NEXT: pmulhuw {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: pand %xmm1, %xmm0 +; X32-SSE-NEXT: por %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %shift = lshr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3> + ret <4 x i16> %shift +} + +define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind { +; SSE2-LABEL: constant_shift_v2i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $3, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v2i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $3, %xmm1 +; SSE41-NEXT: psrlw $2, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; SSE41-NEXT: retq +; +; AVX-LABEL: constant_shift_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlw $3, %xmm0, %xmm1 +; AVX-NEXT: vpsrlw $2, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX-NEXT: retq +; +; XOP-LABEL: constant_shift_v2i16: +; XOP: # %bb.0: +; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: constant_shift_v2i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpsrlw $3, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpsrlw $2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v2i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u> +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v2i16: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpsrlw $3, %xmm0, %xmm1 +; AVX512DQVL-NEXT: vpsrlw $2, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v2i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: constant_shift_v2i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE-NEXT: psrlw $3, %xmm1 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] +; X32-SSE-NEXT: psrlw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: pandn %xmm1, %xmm2 +; X32-SSE-NEXT: por %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %shift = lshr <2 x i16> %a, <i16 2, i16 3> + ret <2 x i16> %shift +} + +define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { +; SSE2-LABEL: constant_shift_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: packuswb %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: constant_shift_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; XOP-LABEL: constant_shift_v8i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: constant_shift_v8i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v8i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: constant_shift_v8i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: psrlw $8, %xmm0 +; X32-SSE-NEXT: packuswb %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %shift = lshr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7> + ret <8 x i8> %shift +} + +define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { +; SSE2-LABEL: constant_shift_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: packuswb %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: constant_shift_v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; XOP-LABEL: constant_shift_v4i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: constant_shift_v4i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v4i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v4i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: constant_shift_v4i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: psrlw $8, %xmm0 +; X32-SSE-NEXT: packuswb %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %shift = lshr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3> + ret <4 x i8> %shift +} + +define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { +; SSE2-LABEL: constant_shift_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v2i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: packuswb %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: constant_shift_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; XOP-LABEL: constant_shift_v2i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: constant_shift_v2i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v2i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v2i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v2i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: constant_shift_v2i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: psrlw $8, %xmm0 +; X32-SSE-NEXT: packuswb %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %shift = lshr <2 x i8> %a, <i8 2, i8 3> + ret <2 x i8> %shift +} + +; +; Uniform Constant Shifts +; + +define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind { +; SSE-LABEL: splatconstant_shift_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: psrld $5, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld $5, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatconstant_shift_v2i32: +; XOP: # %bb.0: +; XOP-NEXT: vpsrld $5, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatconstant_shift_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $5, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsrld $5, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatconstant_shift_v2i32: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psrld $5, %xmm0 +; X32-SSE-NEXT: retl + %shift = lshr <2 x i32> %a, <i32 5, i32 5> + ret <2 x i32> %shift +} + +define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind { +; SSE-LABEL: splatconstant_shift_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: psrlw $3, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatconstant_shift_v4i16: +; XOP: # %bb.0: +; XOP-NEXT: vpsrlw $3, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatconstant_shift_v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v4i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatconstant_shift_v4i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psrlw $3, %xmm0 +; X32-SSE-NEXT: retl + %shift = lshr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3> + ret <4 x i16> %shift +} + +define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind { +; SSE-LABEL: splatconstant_shift_v2i16: +; SSE: # %bb.0: +; SSE-NEXT: psrlw $3, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatconstant_shift_v2i16: +; XOP: # %bb.0: +; XOP-NEXT: vpsrlw $3, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatconstant_shift_v2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v2i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatconstant_shift_v2i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psrlw $3, %xmm0 +; X32-SSE-NEXT: retl + %shift = lshr <2 x i16> %a, <i16 3, i16 3> + ret <2 x i16> %shift +} + +define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind { +; SSE-LABEL: splatconstant_shift_v8i8: +; SSE: # %bb.0: +; SSE-NEXT: psrlw $3, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatconstant_shift_v8i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatconstant_shift_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v8i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatconstant_shift_v8i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psrlw $3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: retl + %shift = lshr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> + ret <8 x i8> %shift +} + +define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind { +; SSE-LABEL: splatconstant_shift_v4i8: +; SSE: # %bb.0: +; SSE-NEXT: psrlw $3, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatconstant_shift_v4i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatconstant_shift_v4i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v4i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatconstant_shift_v4i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psrlw $3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: retl + %shift = lshr <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3> + ret <4 x i8> %shift +} + +define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind { +; SSE-LABEL: splatconstant_shift_v2i8: +; SSE: # %bb.0: +; SSE-NEXT: psrlw $3, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatconstant_shift_v2i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatconstant_shift_v2i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v2i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatconstant_shift_v2i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psrlw $3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: retl + %shift = lshr <2 x i8> %a, <i8 3, i8 3> + ret <2 x i8> %shift +} diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128-widen.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128-widen.ll new file mode 100644 index 00000000000..1323f66bdc4 --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128-widen.ll @@ -0,0 +1,1944 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL +; +; Just one 32-bit run to make sure we do reasonable things for i64 shifts. +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2 + +; +; Variable Shifts +; + +define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { +; SSE2-LABEL: var_shift_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pslld $23, %xmm1 +; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1 +; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pslld $23, %xmm1 +; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1 +; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE41-NEXT: pmulld %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: var_shift_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: var_shift_v2i32: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: var_shift_v2i32: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq +; +; AVX512-LABEL: var_shift_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: var_shift_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: var_shift_v2i32: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pslld $23, %xmm1 +; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1 +; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X32-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X32-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE-NEXT: retl + %shift = shl <2 x i32> %a, %b + ret <2 x i32> %shift +} + +define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { +; SSE2-LABEL: var_shift_v4i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: pslld $23, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: cvttps2dq %xmm3, %xmm3 +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: pslld $23, %xmm1 +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v4i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE41-NEXT: pslld $23, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; SSE41-NEXT: paddd %xmm3, %xmm1 +; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE41-NEXT: pslld $23, %xmm2 +; SSE41-NEXT: paddd %xmm3, %xmm2 +; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 +; SSE41-NEXT: packusdw %xmm1, %xmm2 +; SSE41-NEXT: pmullw %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: var_shift_v4i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v4i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; XOP-LABEL: var_shift_v4i16: +; XOP: # %bb.0: +; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: var_shift_v4i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v4i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v4i16: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQVL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v4i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsllvw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: var_shift_v4i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pxor %xmm2, %xmm2 +; X32-SSE-NEXT: movdqa %xmm1, %xmm3 +; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; X32-SSE-NEXT: pslld $23, %xmm3 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; X32-SSE-NEXT: paddd %xmm4, %xmm3 +; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm3 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X32-SSE-NEXT: pslld $23, %xmm1 +; X32-SSE-NEXT: paddd %xmm4, %xmm1 +; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; X32-SSE-NEXT: pmullw %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %shift = shl <4 x i16> %a, %b + ret <4 x i16> %shift +} + +define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { +; SSE2-LABEL: var_shift_v2i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: pslld $23, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: cvttps2dq %xmm3, %xmm3 +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: pslld $23, %xmm1 +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v2i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE41-NEXT: pslld $23, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; SSE41-NEXT: paddd %xmm3, %xmm1 +; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE41-NEXT: pslld $23, %xmm2 +; SSE41-NEXT: paddd %xmm3, %xmm2 +; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 +; SSE41-NEXT: packusdw %xmm1, %xmm2 +; SSE41-NEXT: pmullw %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: var_shift_v2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; XOP-LABEL: var_shift_v2i16: +; XOP: # %bb.0: +; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: var_shift_v2i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v2i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v2i16: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQVL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v2i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsllvw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: var_shift_v2i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pxor %xmm2, %xmm2 +; X32-SSE-NEXT: movdqa %xmm1, %xmm3 +; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; X32-SSE-NEXT: pslld $23, %xmm3 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; X32-SSE-NEXT: paddd %xmm4, %xmm3 +; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm3 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X32-SSE-NEXT: pslld $23, %xmm1 +; X32-SSE-NEXT: paddd %xmm4, %xmm1 +; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; X32-SSE-NEXT: pmullw %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %shift = shl <2 x i16> %a, %b + ret <2 x i16> %shift +} + +define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { +; SSE2-LABEL: var_shift_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: psllw $5, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psllw $4, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psllw $2, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: var_shift_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpsllw $4, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsllw $2, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: var_shift_v8i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: var_shift_v8i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v8i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: var_shift_v8i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psllw $5, %xmm1 +; X32-SSE-NEXT: pxor %xmm2, %xmm2 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pandn %xmm0, %xmm4 +; X32-SSE-NEXT: psllw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: paddb %xmm1, %xmm1 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pandn %xmm0, %xmm4 +; X32-SSE-NEXT: psllw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: paddb %xmm1, %xmm1 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm1 +; X32-SSE-NEXT: pandn %xmm0, %xmm1 +; X32-SSE-NEXT: paddb %xmm0, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %shift = shl <8 x i8> %a, %b + ret <8 x i8> %shift +} + +define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { +; SSE2-LABEL: var_shift_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: psllw $5, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psllw $4, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psllw $2, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: var_shift_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpsllw $4, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsllw $2, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: var_shift_v4i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: var_shift_v4i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v4i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v4i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: var_shift_v4i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psllw $5, %xmm1 +; X32-SSE-NEXT: pxor %xmm2, %xmm2 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pandn %xmm0, %xmm4 +; X32-SSE-NEXT: psllw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: paddb %xmm1, %xmm1 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pandn %xmm0, %xmm4 +; X32-SSE-NEXT: psllw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: paddb %xmm1, %xmm1 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm1 +; X32-SSE-NEXT: pandn %xmm0, %xmm1 +; X32-SSE-NEXT: paddb %xmm0, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %shift = shl <4 x i8> %a, %b + ret <4 x i8> %shift +} + +define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { +; SSE2-LABEL: var_shift_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: psllw $5, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v2i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psllw $4, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psllw $2, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: var_shift_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpsllw $4, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsllw $2, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: var_shift_v2i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: var_shift_v2i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v2i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v2i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v2i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: var_shift_v2i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psllw $5, %xmm1 +; X32-SSE-NEXT: pxor %xmm2, %xmm2 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pandn %xmm0, %xmm4 +; X32-SSE-NEXT: psllw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: paddb %xmm1, %xmm1 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pandn %xmm0, %xmm4 +; X32-SSE-NEXT: psllw $2, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: paddb %xmm1, %xmm1 +; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm1 +; X32-SSE-NEXT: pandn %xmm0, %xmm1 +; X32-SSE-NEXT: paddb %xmm0, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %shift = shl <2 x i8> %a, %b + ret <2 x i8> %shift +} + +; +; Uniform Variable Shifts +; + +define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { +; SSE2-LABEL: splatvar_shift_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE2-NEXT: pslld %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: pslld %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: splatvar_shift_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatvar_shift_v2i32: +; XOP: # %bb.0: +; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; XOP-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatvar_shift_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_shift_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512VL-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatvar_shift_v2i32: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: xorps %xmm2, %xmm2 +; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; X32-SSE-NEXT: pslld %xmm2, %xmm0 +; X32-SSE-NEXT: retl + %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer + %shift = shl <2 x i32> %a, %splat + ret <2 x i32> %shift +} + +define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { +; SSE2-LABEL: splatvar_shift_v4i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psllw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v4i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psllw %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: splatvar_shift_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatvar_shift_v4i16: +; XOP: # %bb.0: +; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; XOP-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatvar_shift_v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_shift_v4i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatvar_shift_v4i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psllw %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer + %shift = shl <4 x i16> %a, %splat + ret <4 x i16> %shift +} + +define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { +; SSE2-LABEL: splatvar_shift_v2i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psllw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v2i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psllw %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: splatvar_shift_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatvar_shift_v2i16: +; XOP: # %bb.0: +; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; XOP-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatvar_shift_v2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_shift_v2i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatvar_shift_v2i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psllw %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer + %shift = shl <2 x i16> %a, %splat + ret <2 x i16> %shift +} + +define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { +; SSE2-LABEL: splatvar_shift_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psllw %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: psllw %xmm1, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psllw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: psllw %xmm1, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pshufb %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatvar_shift_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_shift_v8i8: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_shift_v8i8: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq +; +; AVX512DQ-LABEL: splatvar_shift_v8i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: splatvar_shift_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: splatvar_shift_v8i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_shift_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: splatvar_shift_v8i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psllw %xmm1, %xmm0 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; X32-SSE-NEXT: psllw %xmm1, %xmm2 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X32-SSE-NEXT: pand %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer + %shift = shl <8 x i8> %a, %splat + ret <8 x i8> %shift +} + +define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { +; SSE2-LABEL: splatvar_shift_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psllw %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: psllw %xmm1, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psllw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: psllw %xmm1, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pshufb %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatvar_shift_v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_shift_v4i8: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_shift_v4i8: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq +; +; AVX512DQ-LABEL: splatvar_shift_v4i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: splatvar_shift_v4i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: splatvar_shift_v4i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_shift_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: splatvar_shift_v4i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psllw %xmm1, %xmm0 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; X32-SSE-NEXT: psllw %xmm1, %xmm2 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X32-SSE-NEXT: pand %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer + %shift = shl <4 x i8> %a, %splat + ret <4 x i8> %shift +} + +define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { +; SSE2-LABEL: splatvar_shift_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psllw %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: psllw %xmm1, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v2i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psllw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: psllw %xmm1, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pshufb %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatvar_shift_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOP-LABEL: splatvar_shift_v2i8: +; XOP: # %bb.0: +; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u] +; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: splatvar_shift_v2i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: splatvar_shift_v2i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: splatvar_shift_v2i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_shift_v2i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: splatvar_shift_v2i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psllw %xmm1, %xmm0 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; X32-SSE-NEXT: psllw %xmm1, %xmm2 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X32-SSE-NEXT: pand %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer + %shift = shl <2 x i8> %a, %splat + ret <2 x i8> %shift +} + +; +; Constant Shifts +; + +define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind { +; SSE2-LABEL: constant_shift_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pslld $4, %xmm1 +; SSE2-NEXT: pslld $5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pslld $5, %xmm1 +; SSE41-NEXT: pslld $4, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: constant_shift_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpslld $5, %xmm0, %xmm1 +; AVX1-NEXT: vpslld $4, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: constant_shift_v2i32: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: constant_shift_v2i32: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX2-NEXT: retq +; +; AVX512-LABEL: constant_shift_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: constant_shift_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: constant_shift_v2i32: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE-NEXT: pslld $4, %xmm1 +; X32-SSE-NEXT: pslld $5, %xmm0 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %shift = shl <2 x i32> %a, <i32 4, i32 5> + ret <2 x i32> %shift +} + +define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { +; SSE-LABEL: constant_shift_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: constant_shift_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: constant_shift_v4i16: +; XOP: # %bb.0: +; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: constant_shift_v4i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v4i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u> +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v4i16: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v4i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: constant_shift_v4i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: retl + %shift = shl <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3> + ret <4 x i16> %shift +} + +define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind { +; SSE2-LABEL: constant_shift_v2i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v2i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psllw $3, %xmm1 +; SSE41-NEXT: psllw $2, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; SSE41-NEXT: retq +; +; AVX-LABEL: constant_shift_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $3, %xmm0, %xmm1 +; AVX-NEXT: vpsllw $2, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX-NEXT: retq +; +; XOP-LABEL: constant_shift_v2i16: +; XOP: # %bb.0: +; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: constant_shift_v2i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpsllw $3, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpsllw $2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v2i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u> +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v2i16: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpsllw $3, %xmm0, %xmm1 +; AVX512DQVL-NEXT: vpsllw $2, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v2i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: constant_shift_v2i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: retl + %shift = shl <2 x i16> %a, <i16 2, i16 3> + ret <2 x i16> %shift +} + +define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { +; SSE2-LABEL: constant_shift_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: packuswb %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: constant_shift_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; XOP-LABEL: constant_shift_v8i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: constant_shift_v8i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v8i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: constant_shift_v8i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: packuswb %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %shift = shl <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7> + ret <8 x i8> %shift +} + +define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { +; SSE2-LABEL: constant_shift_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: packuswb %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: constant_shift_v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; XOP-LABEL: constant_shift_v4i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: constant_shift_v4i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v4i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v4i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: constant_shift_v4i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: packuswb %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %shift = shl <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3> + ret <4 x i8> %shift +} + +define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { +; SSE2-LABEL: constant_shift_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v2i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: packuswb %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: constant_shift_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; XOP-LABEL: constant_shift_v2i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: constant_shift_v2i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v2i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v2i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v2i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: constant_shift_v2i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: packuswb %xmm1, %xmm0 +; X32-SSE-NEXT: retl + %shift = shl <2 x i8> %a, <i8 2, i8 3> + ret <2 x i8> %shift +} + +; +; Uniform Constant Shifts +; + +define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind { +; SSE-LABEL: splatconstant_shift_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: pslld $5, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpslld $5, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatconstant_shift_v2i32: +; XOP: # %bb.0: +; XOP-NEXT: vpslld $5, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatconstant_shift_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $5, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpslld $5, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatconstant_shift_v2i32: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pslld $5, %xmm0 +; X32-SSE-NEXT: retl + %shift = shl <2 x i32> %a, <i32 5, i32 5> + ret <2 x i32> %shift +} + +define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind { +; SSE-LABEL: splatconstant_shift_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: psllw $3, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatconstant_shift_v4i16: +; XOP: # %bb.0: +; XOP-NEXT: vpsllw $3, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatconstant_shift_v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v4i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatconstant_shift_v4i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psllw $3, %xmm0 +; X32-SSE-NEXT: retl + %shift = shl <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3> + ret <4 x i16> %shift +} + +define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind { +; SSE-LABEL: splatconstant_shift_v2i16: +; SSE: # %bb.0: +; SSE-NEXT: psllw $3, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatconstant_shift_v2i16: +; XOP: # %bb.0: +; XOP-NEXT: vpsllw $3, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatconstant_shift_v2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v2i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatconstant_shift_v2i16: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psllw $3, %xmm0 +; X32-SSE-NEXT: retl + %shift = shl <2 x i16> %a, <i16 3, i16 3> + ret <2 x i16> %shift +} + +define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind { +; SSE-LABEL: splatconstant_shift_v8i8: +; SSE: # %bb.0: +; SSE-NEXT: psllw $3, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatconstant_shift_v8i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatconstant_shift_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v8i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatconstant_shift_v8i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psllw $3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: retl + %shift = shl <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> + ret <8 x i8> %shift +} + +define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind { +; SSE-LABEL: splatconstant_shift_v4i8: +; SSE: # %bb.0: +; SSE-NEXT: psllw $3, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatconstant_shift_v4i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatconstant_shift_v4i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v4i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatconstant_shift_v4i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psllw $3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: retl + %shift = shl <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3> + ret <4 x i8> %shift +} + +define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind { +; SSE-LABEL: splatconstant_shift_v2i8: +; SSE: # %bb.0: +; SSE-NEXT: psllw $3, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatconstant_shift_v2i8: +; XOP: # %bb.0: +; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatconstant_shift_v2i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v2i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: splatconstant_shift_v2i8: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: psllw $3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: retl + %shift = shl <2 x i8> %a, <i8 3, i8 3> + ret <2 x i8> %shift +} diff --git a/llvm/test/CodeGen/X86/vector-trunc-math-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-math-widen.ll new file mode 100644 index 00000000000..1c2813d35f0 --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-trunc-math-widen.ll @@ -0,0 +1,5197 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512BW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512DQ + +; +; add +; + +define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { +; SSE-LABEL: trunc_add_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: paddq %xmm3, %xmm1 +; SSE-NEXT: paddq %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_add_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_add_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = add <4 x i64> %a0, %a1 + %2 = trunc <4 x i64> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { +; SSE-LABEL: trunc_add_v8i64_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: paddq %xmm6, %xmm2 +; SSE-NEXT: paddq %xmm7, %xmm3 +; SSE-NEXT: paddq %xmm4, %xmm0 +; SSE-NEXT: paddq %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_add_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddq %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_add_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpaddq %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_add_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = add <8 x i64> %a0, %a1 + %2 = trunc <8 x i64> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { +; SSE-LABEL: trunc_add_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm3, %xmm1 +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_add_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_add_v8i32_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = add <8 x i32> %a0, %a1 + %2 = trunc <8 x i32> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { +; SSE-LABEL: trunc_add_v16i64_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: packuswb %xmm5, %xmm4 +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3 +; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3 +; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_add_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpaddq %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddq %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpaddq %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpaddq %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_add_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpaddq %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpaddq %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddq %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpaddq %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_add_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = add <16 x i64> %a0, %a1 + %2 = trunc <16 x i64> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { +; SSE-LABEL: trunc_add_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: paddd %xmm4, %xmm0 +; SSE-NEXT: paddd %xmm5, %xmm1 +; SSE-NEXT: paddd %xmm6, %xmm2 +; SSE-NEXT: paddd %xmm7, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255] +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2 +; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_add_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_add_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = add <16 x i32> %a0, %a1 + %2 = trunc <16 x i32> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { +; SSE-LABEL: trunc_add_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: paddw %xmm2, %xmm0 +; SSE-NEXT: paddw %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_add_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_add_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_add_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_add_v16i16_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = add <16 x i16> %a0, %a1 + %2 = trunc <16 x i16> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) { +; SSE-LABEL: trunc_add_v8i32_v8i16_sext_8i8: +; SSE: # %bb.0: +; SSE-NEXT: pslld $16, %xmm2 +; SSE-NEXT: psrad $16, %xmm2 +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: packssdw %xmm2, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: psraw $8, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_add_v8i32_v8i16_sext_8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %2 = sext <8 x i8> %1 to <8 x i32> + %3 = add <8 x i32> %2, %a1 + %4 = trunc <8 x i32> %3 to <8 x i16> + ret <8 x i16> %4 +} + +; +; add to constant +; + +define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind { +; SSE-LABEL: trunc_add_const_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: paddd {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_const_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-SLOW-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_add_const_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_add_const_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> + %2 = trunc <4 x i64> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind { +; SSE-LABEL: trunc_add_const_v8i64_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: paddw {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_const_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_add_const_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_add_const_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_add_const_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> + %2 = trunc <8 x i64> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind { +; SSE-LABEL: trunc_add_const_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: paddw {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_const_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_add_const_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_add_const_v8i32_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %2 = trunc <8 x i32> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { +; SSE-LABEL: trunc_add_const_v16i64_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: packuswb %xmm5, %xmm4 +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_const_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_add_const_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_add_const_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_add_const_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> + %2 = trunc <16 x i64> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind { +; SSE-LABEL: trunc_add_const_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_const_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_add_const_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_add_const_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %2 = trunc <16 x i32> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind { +; SSE-LABEL: trunc_add_const_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_const_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_add_const_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_add_const_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> + %2 = trunc <16 x i16> %1 to <16 x i8> + ret <16 x i8> %2 +} + +; +; sub +; + +define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { +; SSE-LABEL: trunc_sub_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: psubq %xmm3, %xmm1 +; SSE-NEXT: psubq %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_sub_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_sub_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_sub_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = sub <4 x i64> %a0, %a1 + %2 = trunc <4 x i64> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { +; SSE-LABEL: trunc_sub_v8i64_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: psubq %xmm6, %xmm2 +; SSE-NEXT: psubq %xmm7, %xmm3 +; SSE-NEXT: psubq %xmm4, %xmm0 +; SSE-NEXT: psubq %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_sub_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_sub_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpsubq %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpsubq %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_sub_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpsubq %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpsubq %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_sub_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = sub <8 x i64> %a0, %a1 + %2 = trunc <8 x i64> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { +; SSE-LABEL: trunc_sub_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: psubd %xmm2, %xmm0 +; SSE-NEXT: psubd %xmm3, %xmm1 +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_sub_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_sub_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_sub_v8i32_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = sub <8 x i32> %a0, %a1 + %2 = trunc <8 x i32> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { +; SSE-LABEL: trunc_sub_v16i64_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: packuswb %xmm5, %xmm4 +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_sub_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3 +; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3 +; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_sub_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpsubq %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpsubq %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpsubq %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpsubq %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_sub_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpsubq %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpsubq %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpsubq %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpsubq %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_sub_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = sub <16 x i64> %a0, %a1 + %2 = trunc <16 x i64> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { +; SSE-LABEL: trunc_sub_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: psubd %xmm4, %xmm0 +; SSE-NEXT: psubd %xmm5, %xmm1 +; SSE-NEXT: psubd %xmm6, %xmm2 +; SSE-NEXT: psubd %xmm7, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_sub_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255] +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2 +; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_sub_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_sub_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = sub <16 x i32> %a0, %a1 + %2 = trunc <16 x i32> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { +; SSE-LABEL: trunc_sub_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: psubw %xmm2, %xmm0 +; SSE-NEXT: psubw %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_sub_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_sub_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_sub_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_sub_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = sub <16 x i16> %a0, %a1 + %2 = trunc <16 x i16> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) { +; SSE-LABEL: trunc_ext_sub_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: psubb %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: trunc_ext_sub_v16i16_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %a = zext <16 x i8> %x to <16 x i16> + %b = zext <16 x i8> %y to <16 x i16> + %c = sub <16 x i16> %a, %b + %d = trunc <16 x i16> %c to <16 x i8> + ret <16 x i8> %d +} + +; +; sub to constant +; + +define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind { +; SSE-LABEL: trunc_sub_const_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: psubd {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_sub_const_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-SLOW-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_sub_const_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_sub_const_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> + %2 = trunc <4 x i64> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind { +; SSE-LABEL: trunc_sub_const_v8i64_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: psubw {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_sub_const_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_sub_const_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_sub_const_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_sub_const_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> + %2 = trunc <8 x i64> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind { +; SSE-LABEL: trunc_sub_const_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: psubw {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_sub_const_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_sub_const_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_sub_const_v8i32_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %2 = trunc <8 x i32> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { +; SSE-LABEL: trunc_sub_const_v16i64_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: packuswb %xmm5, %xmm4 +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: psubb {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_sub_const_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_sub_const_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_sub_const_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_sub_const_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> + %2 = trunc <16 x i64> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind { +; SSE-LABEL: trunc_sub_const_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: psubb {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_sub_const_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_sub_const_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_sub_const_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %2 = trunc <16 x i32> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind { +; SSE-LABEL: trunc_sub_const_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: psubb {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_sub_const_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_sub_const_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> + %2 = trunc <16 x i16> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_ext_sub_const_rhs_v16i16_v16i8(<16 x i8> %x) { +; SSE-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: psubb {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq + %a = zext <16 x i8> %x to <16 x i16> + %b = sub <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> + %c = trunc <16 x i16> %b to <16 x i8> + ret <16 x i8> %c +} + +define <16 x i8> @trunc_ext_sub_const_lhs_v16i16_v16i8(<16 x i8> %x) { +; SSE-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; SSE-NEXT: psubb %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %a = zext <16 x i8> %x to <16 x i16> + %b = sub <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a + %c = trunc <16 x i16> %b to <16 x i8> + ret <16 x i8> %c +} + +; +; mul +; + +define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { +; SSE-LABEL: trunc_mul_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pmuludq %xmm3, %xmm1 +; SSE-NEXT: pmuludq %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_mul_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc_mul_v4i64_v4i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_mul_v4i64_v4i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = mul <4 x i64> %a0, %a1 + %2 = trunc <4 x i64> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { +; SSE-LABEL: trunc_mul_v8i64_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: pmullw %xmm6, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535] +; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_mul_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_mul_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc_mul_v8i64_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_mul_v8i64_v8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = mul <8 x i64> %a0, %a1 + %2 = trunc <8 x i64> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { +; SSE-LABEL: trunc_mul_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE-NEXT: pmuludq %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_mul_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_mul_v8i32_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = mul <8 x i32> %a0, %a1 + %2 = trunc <8 x i32> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { +; SSE-LABEL: trunc_mul_v16i64_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: packuswb %xmm5, %xmm4 +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3 +; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3 +; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_mul_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm8 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm7[0,2],xmm8[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm7 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm7[0,2] +; AVX2-SLOW-NEXT: vpmulld %xmm8, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm6, %xmm7 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm7 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2] +; AVX2-SLOW-NEXT: vpmulld %xmm6, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm6, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm7 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,2],xmm7[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2] +; AVX2-SLOW-NEXT: vpmulld %xmm5, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vpmulld %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpand %xmm6, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_mul_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm3 +; AVX2-FAST-NEXT: vpmulld %xmm7, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vpmulld %xmm6, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm6, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vpmulld %xmm5, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpmulld %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpand %xmm6, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc_mul_v16i64_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovqd %zmm3, %ymm3 +; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vpmulld %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovqd %zmm2, %ymm2 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmulld %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_mul_v16i64_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovqd %zmm3, %ymm3 +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vpmulld %ymm3, %ymm1, %ymm1 +; AVX512BW-NEXT: vpmovqd %zmm2, %ymm2 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmulld %ymm2, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmullq %zmm3, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmullq %zmm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = mul <16 x i64> %a0, %a1 + %2 = trunc <16 x i64> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { +; SSE-LABEL: trunc_mul_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE-NEXT: pmuludq %xmm8, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] +; SSE-NEXT: pmuludq %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE-NEXT: pmuludq %xmm6, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSE-NEXT: pmuludq %xmm7, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255] +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2 +; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_mul_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_mul_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = mul <16 x i32> %a0, %a1 + %2 = trunc <16 x i32> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { +; SSE-LABEL: trunc_mul_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pmullw %xmm2, %xmm0 +; SSE-NEXT: pmullw %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_mul_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_mul_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_mul_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = mul <16 x i16> %a0, %a1 + %2 = trunc <16 x i16> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) { +; SSE-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: pslld $16, %xmm2 +; SSE-NEXT: psrad $16, %xmm2 +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: packssdw %xmm2, %xmm1 +; SSE-NEXT: pmullw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %2 = zext <8 x i8> %1 to <8 x i32> + %3 = mul <8 x i32> %2, %a1 + %4 = trunc <8 x i32> %3 to <8 x i16> + ret <8 x i16> %4 +} + +; +; mul to constant +; + +define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { +; SSE-LABEL: trunc_mul_const_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_const_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_mul_const_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_mul_const_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> + %2 = trunc <4 x i64> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { +; SSE-LABEL: trunc_mul_const_v8i64_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_const_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_mul_const_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_mul_const_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_mul_const_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> + %2 = trunc <8 x i64> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind { +; SSE-LABEL: trunc_mul_const_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_const_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_mul_const_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_mul_const_v8i32_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %2 = trunc <8 x i32> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { +; SSE-LABEL: trunc_mul_const_v16i64_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm4 +; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm5 +; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm6 +; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: packuswb %xmm5, %xmm4 +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_const_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm3, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255] +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7 +; AVX1-NEXT: vpackusdw %xmm3, %xmm7, %xmm3 +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3 +; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3 +; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_mul_const_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_mul_const_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_mul_const_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> + %2 = trunc <16 x i64> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind { +; SSE-LABEL: trunc_mul_const_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE-NEXT: pmuludq %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE-NEXT: pmuludq %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,9,10,11] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE-NEXT: pmuludq %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [12,13,14,15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE-NEXT: pmuludq %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_const_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255] +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_mul_const_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_mul_const_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmulld {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %2 = trunc <16 x i32> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind { +; SSE-LABEL: trunc_mul_const_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_const_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_mul_const_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> + %2 = trunc <16 x i16> %1 to <16 x i8> + ret <16 x i8> %2 +} + +; +; and +; + +define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { +; SSE-LABEL: trunc_and_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: andps %xmm3, %xmm1 +; SSE-NEXT: andps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_and_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_and_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_and_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <4 x i64> %a0, %a1 + %2 = trunc <4 x i64> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { +; SSE-LABEL: trunc_and_v8i64_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_and_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535] +; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_and_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_and_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_and_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <8 x i64> %a0, %a1 + %2 = trunc <8 x i64> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { +; SSE-LABEL: trunc_and_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_and_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_and_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_and_v8i32_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <8 x i32> %a0, %a1 + %2 = trunc <8 x i32> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { +; SSE-LABEL: trunc_and_v16i64_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: packuswb %xmm5, %xmm4 +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_and_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,255] +; AVX1-NEXT: vandps %ymm8, %ymm7, %ymm7 +; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7 +; AVX1-NEXT: vpackusdw %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vandps %ymm8, %ymm6, %ymm6 +; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 +; AVX1-NEXT: vpackusdw %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %ymm8, %ymm5, %ymm3 +; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm8, %ymm4, %ymm3 +; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_and_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vandps %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vandps %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vandps %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_and_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpand %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_and_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <16 x i64> %a0, %a1 + %2 = trunc <16 x i64> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { +; SSE-LABEL: trunc_and_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm5, %xmm0 +; SSE-NEXT: packuswb %xmm6, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_and_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_and_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_and_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <16 x i32> %a0, %a1 + %2 = trunc <16 x i32> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { +; SSE-LABEL: trunc_and_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm3, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_and_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_and_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_and_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_and_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_and_v16i16_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = and <16 x i16> %a0, %a1 + %2 = trunc <16 x i16> %1 to <16 x i8> + ret <16 x i8> %2 +} + +; +; and to constant +; + +define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind { +; SSE-LABEL: trunc_and_const_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_and_const_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-SLOW-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_and_const_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_and_const_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> + %2 = trunc <4 x i64> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { +; SSE-LABEL: trunc_and_const_v8i64_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: andpd {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_and_const_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_and_const_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_and_const_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_and_const_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> + %2 = trunc <8 x i64> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind { +; SSE-LABEL: trunc_and_const_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_and_const_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_and_const_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_and_const_v8i32_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %2 = trunc <8 x i32> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { +; SSE-LABEL: trunc_and_const_v16i64_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: packuswb %xmm5, %xmm4 +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_and_const_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_and_const_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_and_const_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_and_const_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> + %2 = trunc <16 x i64> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { +; SSE-LABEL: trunc_and_const_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_and_const_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_and_const_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_and_const_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %2 = trunc <16 x i32> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind { +; SSE-LABEL: trunc_and_const_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_and_const_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_and_const_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_and_const_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> + %2 = trunc <16 x i16> %1 to <16 x i8> + ret <16 x i8> %2 +} + +; +; xor +; + +define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { +; SSE-LABEL: trunc_xor_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: xorps %xmm3, %xmm1 +; SSE-NEXT: xorps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_xor_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_xor_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_xor_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = xor <4 x i64> %a0, %a1 + %2 = trunc <4 x i64> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { +; SSE-LABEL: trunc_xor_v8i64_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm6, %xmm2 +; SSE-NEXT: pxor %xmm7, %xmm3 +; SSE-NEXT: pxor %xmm4, %xmm0 +; SSE-NEXT: pxor %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_xor_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_xor_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vxorps %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vxorps %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_xor_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_xor_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = xor <8 x i64> %a0, %a1 + %2 = trunc <8 x i64> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { +; SSE-LABEL: trunc_xor_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm1 +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_xor_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_xor_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_xor_v8i32_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = xor <8 x i32> %a0, %a1 + %2 = trunc <8 x i32> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { +; SSE-LABEL: trunc_xor_v16i64_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: packuswb %xmm5, %xmm4 +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_xor_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1 +; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2 +; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3 +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_xor_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vxorps %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vxorps %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vxorps %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vxorps %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_xor_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpxor %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpxor %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpxor %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_xor_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = xor <16 x i64> %a0, %a1 + %2 = trunc <16 x i64> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { +; SSE-LABEL: trunc_xor_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm4, %xmm0 +; SSE-NEXT: pxor %xmm5, %xmm1 +; SSE-NEXT: pxor %xmm6, %xmm2 +; SSE-NEXT: pxor %xmm7, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_xor_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_xor_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_xor_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = xor <16 x i32> %a0, %a1 + %2 = trunc <16 x i32> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { +; SSE-LABEL: trunc_xor_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_xor_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_xor_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_xor_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_xor_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = xor <16 x i16> %a0, %a1 + %2 = trunc <16 x i16> %1 to <16 x i8> + ret <16 x i8> %2 +} + +; +; xor to constant +; + +define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind { +; SSE-LABEL: trunc_xor_const_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: xorps {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_xor_const_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-SLOW-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_xor_const_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_xor_const_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> + %2 = trunc <4 x i64> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { +; SSE-LABEL: trunc_xor_const_v8i64_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: xorpd {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_xor_const_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_xor_const_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_xor_const_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_xor_const_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> + %2 = trunc <8 x i64> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind { +; SSE-LABEL: trunc_xor_const_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_xor_const_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_xor_const_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_xor_const_v8i32_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %2 = trunc <8 x i32> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { +; SSE-LABEL: trunc_xor_const_v16i64_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: packuswb %xmm5, %xmm4 +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_xor_const_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_xor_const_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_xor_const_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_xor_const_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> + %2 = trunc <16 x i64> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { +; SSE-LABEL: trunc_xor_const_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_xor_const_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_xor_const_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_xor_const_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %2 = trunc <16 x i32> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind { +; SSE-LABEL: trunc_xor_const_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_xor_const_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_xor_const_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> + %2 = trunc <16 x i16> %1 to <16 x i8> + ret <16 x i8> %2 +} + +; +; or +; + +define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { +; SSE-LABEL: trunc_or_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: orps %xmm3, %xmm1 +; SSE-NEXT: orps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_or_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_or_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_or_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = or <4 x i64> %a0, %a1 + %2 = trunc <4 x i64> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { +; SSE-LABEL: trunc_or_v8i64_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_or_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_or_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_or_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_or_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = or <8 x i64> %a0, %a1 + %2 = trunc <8 x i64> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { +; SSE-LABEL: trunc_or_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_or_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_or_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_or_v8i32_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = or <8 x i32> %a0, %a1 + %2 = trunc <8 x i32> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { +; SSE-LABEL: trunc_or_v16i64_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: packuswb %xmm5, %xmm4 +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_or_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vorps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1 +; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2 +; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3 +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_or_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vorps %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vorps %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vorps %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vorps %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_or_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpor %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpor %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpor %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpor %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_or_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = or <16 x i64> %a0, %a1 + %2 = trunc <16 x i64> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { +; SSE-LABEL: trunc_or_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_or_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_or_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_or_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = or <16 x i32> %a0, %a1 + %2 = trunc <16 x i32> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { +; SSE-LABEL: trunc_or_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_or_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_or_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_or_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_or_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_or_v16i16_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = or <16 x i16> %a0, %a1 + %2 = trunc <16 x i16> %1 to <16 x i8> + ret <16 x i8> %2 +} + +; +; or to constant +; + +define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind { +; SSE-LABEL: trunc_or_const_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: orps {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_or_const_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-SLOW-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_or_const_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_or_const_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> + %2 = trunc <4 x i64> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind { +; SSE-LABEL: trunc_or_const_v8i64_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: orpd {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_or_const_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_or_const_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_or_const_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_or_const_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> + %2 = trunc <8 x i64> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind { +; SSE-LABEL: trunc_or_const_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: por {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_or_const_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_or_const_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_or_const_v8i32_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %2 = trunc <8 x i32> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { +; SSE-LABEL: trunc_or_const_v16i64_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: packuswb %xmm5, %xmm4 +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: por {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_or_const_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_or_const_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_or_const_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_or_const_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> + %2 = trunc <16 x i64> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { +; SSE-LABEL: trunc_or_const_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: por {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_or_const_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_or_const_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_or_const_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %2 = trunc <16 x i32> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind { +; SSE-LABEL: trunc_or_const_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: por {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_or_const_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_or_const_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_or_const_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> + %2 = trunc <16 x i16> %1 to <16 x i8> + ret <16 x i8> %2 +} + +; +; complex patterns - often created by vectorizer +; + +define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { +; SSE-LABEL: mul_add_const_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] +; SSE-NEXT: pmuludq %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] +; SSE-NEXT: pmuludq %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: paddd {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: mul_add_const_v4i64_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = sext <4 x i32> %a0 to <4 x i64> + %2 = sext <4 x i32> %a1 to <4 x i64> + %3 = mul <4 x i64> %1, %2 + %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3> + %5 = trunc <4 x i64> %4 to <4 x i32> + ret <4 x i32> %5 +} + +define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { +; SSE-LABEL: mul_add_self_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] +; SSE-NEXT: pmuludq %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] +; SSE-NEXT: pmuludq %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: paddd %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: mul_add_self_v4i64_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = sext <4 x i32> %a0 to <4 x i64> + %2 = sext <4 x i32> %a1 to <4 x i64> + %3 = mul <4 x i64> %1, %2 + %4 = add <4 x i64> %3, %3 + %5 = trunc <4 x i64> %4 to <4 x i32> + ret <4 x i32> %5 +} + +define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { +; SSE-LABEL: mul_add_multiuse_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3] +; SSE-NEXT: pmuludq %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] +; SSE-NEXT: pmuludq %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2] +; SSE-NEXT: paddd %xmm4, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: mul_add_multiuse_v4i64_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = sext <4 x i32> %a0 to <4 x i64> + %2 = sext <4 x i32> %a1 to <4 x i64> + %3 = mul <4 x i64> %1, %2 + %4 = add <4 x i64> %1, %3 + %5 = trunc <4 x i64> %4 to <4 x i32> + ret <4 x i32> %5 +} diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-packus-widen.ll new file mode 100644 index 00000000000..eb0a32fee08 --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-trunc-packus-widen.ll @@ -0,0 +1,3079 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL + +; +; PACKUS saturation truncation to vXi32 +; + +define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { +; SSE2-LABEL: trunc_packus_v4i64_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_packus_v4i64_v4i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm1 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_packus_v4i64_v4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647] +; SSE41-NEXT: movdqa %xmm6, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm4, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; SSE41-NEXT: xorpd %xmm1, %xmm1 +; SSE41-NEXT: movapd %xmm4, %xmm2 +; SSE41-NEXT: xorpd %xmm3, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: movapd %xmm5, %xmm4 +; SSE41-NEXT: xorpd %xmm3, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4294967295,4294967295] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm2 +; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_packus_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_packus_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc_packus_v4i64_v4i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] +; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_packus_v4i64_v4i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovusqd %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_packus_v4i64_v4i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] +; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i32: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovusqd %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp slt <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> + %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> + %3 = icmp sgt <4 x i64> %2, zeroinitializer + %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> zeroinitializer + %5 = trunc <4 x i64> %4 to <4 x i32> + ret <4 x i32> %5 +} + + +define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) { +; SSE2-LABEL: trunc_packus_v8i64_v8i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm10, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647] +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm6 +; SSE2-NEXT: por %xmm2, %xmm6 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm1 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm10, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_packus_v8i64_v8i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-NEXT: pxor %xmm10, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647] +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm0, %xmm5 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm0 +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm10, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm6 +; SSSE3-NEXT: por %xmm2, %xmm6 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm10, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm10, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm6, %xmm1 +; SSSE3-NEXT: pxor %xmm10, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm1 +; SSSE3-NEXT: pand %xmm6, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pxor %xmm10, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: pand %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm5, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm0 +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_packus_v8i64_v8i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm9 +; SSE41-NEXT: movapd {{.*#+}} xmm7 = [4294967295,4294967295] +; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] +; SSE41-NEXT: movdqa %xmm5, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm8 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm5, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 +; SSE41-NEXT: xorpd %xmm2, %xmm2 +; SSE41-NEXT: movapd %xmm7, %xmm1 +; SSE41-NEXT: xorpd %xmm10, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm3 +; SSE41-NEXT: movapd %xmm4, %xmm1 +; SSE41-NEXT: xorpd %xmm10, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] +; SSE41-NEXT: movapd %xmm9, %xmm3 +; SSE41-NEXT: xorpd %xmm10, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm3 +; SSE41-NEXT: movapd %xmm8, %xmm4 +; SSE41-NEXT: xorpd %xmm10, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v8i64_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4294967295,4294967295] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm9 +; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7 +; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 +; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm2 +; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_packus_v8i64_v8i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpand %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_packus_v8i64_v8i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpand %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpand %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_packus_v8i64_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovusqd %zmm0, %ymm0 +; AVX512-NEXT: retq + %1 = icmp slt <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> + %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> + %3 = icmp sgt <8 x i64> %2, zeroinitializer + %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer + %5 = trunc <8 x i64> %4 to <8 x i32> + ret <8 x i32> %5 +} + +; +; PACKUS saturation truncation to vXi16 +; + +define <8 x i16> @trunc_packus_v8i64_v8i16(<8 x i64> %a0) { +; SSE2-LABEL: trunc_packus_v8i64_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm10, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183] +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm6 +; SSE2-NEXT: por %xmm3, %xmm6 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm6, %xmm2 +; SSE2-NEXT: pxor %xmm10, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm10, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_packus_v8i64_v8i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm1, %xmm5 +; SSSE3-NEXT: pxor %xmm10, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183] +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm10, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm6 +; SSSE3-NEXT: por %xmm3, %xmm6 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: movdqa %xmm6, %xmm2 +; SSSE3-NEXT: pxor %xmm10, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: pand %xmm6, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pxor %xmm10, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: pand %xmm1, %xmm4 +; SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm10, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm5, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_packus_v8i64_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm9 +; SSE41-NEXT: movapd {{.*#+}} xmm7 = [65535,65535] +; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183] +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm6 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: movapd %xmm7, %xmm1 +; SSE41-NEXT: xorpd %xmm10, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm4 +; SSE41-NEXT: movapd %xmm6, %xmm1 +; SSE41-NEXT: xorpd %xmm10, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 +; SSE41-NEXT: packusdw %xmm4, %xmm1 +; SSE41-NEXT: movapd %xmm2, %xmm4 +; SSE41-NEXT: xorpd %xmm10, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: movapd %xmm8, %xmm2 +; SSE41-NEXT: xorpd %xmm10, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 +; SSE41-NEXT: packusdw %xmm4, %xmm3 +; SSE41-NEXT: packusdw %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm9 +; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7 +; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 +; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm2 +; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v8i64_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vpand %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_packus_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp slt <8 x i64> %a0, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535> + %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535> + %3 = icmp sgt <8 x i64> %2, zeroinitializer + %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer + %5 = trunc <8 x i64> %4 to <8 x i16> + ret <8 x i16> %5 +} + +define <8 x i16> @trunc_packus_v8i32_v8i16(<8 x i32> %a0) { +; SSE2-LABEL: trunc_packus_v8i32_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm2, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_packus_v8i32_v8i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pandn %xmm2, %xmm3 +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pandn %xmm2, %xmm1 +; SSSE3-NEXT: por %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_packus_v8i32_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_packus_v8i32_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_packus_v8i32_v8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovusdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_packus_v8i32_v8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovusdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp slt <8 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %3 = icmp sgt <8 x i32> %2, zeroinitializer + %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer + %5 = trunc <8 x i32> %4 to <8 x i16> + ret <8 x i16> %5 +} + +define <16 x i16> @trunc_packus_v16i32_v16i16(<16 x i32> %a0) { +; SSE2-LABEL: trunc_packus_v16i32_v16i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm6, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm6, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pslld $16, %xmm5 +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm5, %xmm0 +; SSE2-NEXT: pslld $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: packssdw %xmm3, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_packus_v16i32_v16i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535] +; SSSE3-NEXT: movdqa %xmm6, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm6, %xmm4 +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: movdqa %xmm6, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pandn %xmm6, %xmm5 +; SSSE3-NEXT: por %xmm0, %xmm5 +; SSSE3-NEXT: movdqa %xmm6, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm3 +; SSSE3-NEXT: pandn %xmm6, %xmm0 +; SSSE3-NEXT: por %xmm3, %xmm0 +; SSSE3-NEXT: movdqa %xmm6, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pandn %xmm6, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: pand %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm5, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 +; SSSE3-NEXT: pand %xmm4, %xmm5 +; SSSE3-NEXT: pslld $16, %xmm5 +; SSSE3-NEXT: psrad $16, %xmm5 +; SSSE3-NEXT: pslld $16, %xmm0 +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: packssdw %xmm5, %xmm0 +; SSSE3-NEXT: pslld $16, %xmm3 +; SSSE3-NEXT: psrad $16, %xmm3 +; SSSE3-NEXT: pslld $16, %xmm1 +; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: packssdw %xmm3, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_packus_v16i32_v16i16: +; SSE41: # %bb.0: +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v16i32_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v16i32_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_packus_v16i32_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovusdw %zmm0, %ymm0 +; AVX512-NEXT: retq + %1 = icmp slt <16 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %3 = icmp sgt <16 x i32> %2, zeroinitializer + %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer + %5 = trunc <16 x i32> %4 to <16 x i16> + ret <16 x i16> %5 +} + +; +; PACKUS saturation truncation to v16i8 +; + +define <8 x i8> @trunc_packus_v8i64_v8i8(<8 x i64> %a0) { +; SSE2-LABEL: trunc_packus_v8i64_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm10, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm10, %xmm2 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm10, %xmm3 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_packus_v8i64_v8i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm10, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903] +; SSSE3-NEXT: movdqa %xmm9, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm2, %xmm5 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm10, %xmm2 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm10, %xmm3 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm1 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm10, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm10, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: packuswb %xmm3, %xmm2 +; SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSSE3-NEXT: packuswb %xmm0, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_packus_v8i64_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm9 +; SSE41-NEXT: movapd {{.*#+}} xmm7 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903] +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm6 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: movapd %xmm7, %xmm1 +; SSE41-NEXT: xorpd %xmm10, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm4 +; SSE41-NEXT: movapd %xmm6, %xmm1 +; SSE41-NEXT: xorpd %xmm10, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 +; SSE41-NEXT: packusdw %xmm4, %xmm1 +; SSE41-NEXT: movapd %xmm2, %xmm4 +; SSE41-NEXT: xorpd %xmm10, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: movapd %xmm8, %xmm2 +; SSE41-NEXT: xorpd %xmm10, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 +; SSE41-NEXT: packusdw %xmm4, %xmm3 +; SSE41-NEXT: packusdw %xmm3, %xmm1 +; SSE41-NEXT: packuswb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v8i64_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm9 +; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7 +; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 +; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm2 +; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v8i64_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255] +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vpand %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_packus_v8i64_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp slt <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> + %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> + %3 = icmp sgt <8 x i64> %2, zeroinitializer + %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer + %5 = trunc <8 x i64> %4 to <8 x i8> + ret <8 x i8> %5 +} + +define void @trunc_packus_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) { +; SSE2-LABEL: trunc_packus_v8i64_v8i8_store: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm10, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm10, %xmm2 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm10, %xmm3 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movq %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_packus_v8i64_v8i8_store: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm10, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903] +; SSSE3-NEXT: movdqa %xmm9, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm2, %xmm5 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm10, %xmm2 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm10, %xmm3 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm1 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm10, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm10, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: packuswb %xmm3, %xmm2 +; SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSSE3-NEXT: packuswb %xmm0, %xmm0 +; SSSE3-NEXT: movq %xmm0, (%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_packus_v8i64_v8i8_store: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm9 +; SSE41-NEXT: movapd {{.*#+}} xmm7 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903] +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 +; SSE41-NEXT: xorpd %xmm1, %xmm1 +; SSE41-NEXT: movapd %xmm7, %xmm4 +; SSE41-NEXT: xorpd %xmm10, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm5 +; SSE41-NEXT: movapd %xmm3, %xmm4 +; SSE41-NEXT: xorpd %xmm10, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4 +; SSE41-NEXT: packusdw %xmm5, %xmm4 +; SSE41-NEXT: movapd %xmm2, %xmm3 +; SSE41-NEXT: xorpd %xmm10, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: movapd %xmm8, %xmm2 +; SSE41-NEXT: xorpd %xmm10, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 +; SSE41-NEXT: packusdw %xmm3, %xmm1 +; SSE41-NEXT: packusdw %xmm1, %xmm4 +; SSE41-NEXT: packuswb %xmm0, %xmm4 +; SSE41-NEXT: movq %xmm4, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v8i64_v8i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm9 +; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7 +; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 +; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm2 +; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v8i64_v8i8_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255] +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vpand %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: vmovq %xmm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_packus_v8i64_v8i8_store: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovusqb %zmm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp slt <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> + %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> + %3 = icmp sgt <8 x i64> %2, zeroinitializer + %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer + %5 = trunc <8 x i64> %4 to <8 x i8> + store <8 x i8> %5, <8 x i8> *%p1 + ret void +} + +define <16 x i8> @trunc_packus_v16i64_v16i8(<16 x i64> %a0) { +; SSE2-LABEL: trunc_packus_v16i64_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm6, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm11, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm14, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm6 +; SSE2-NEXT: pandn %xmm10, %xmm9 +; SSE2-NEXT: por %xmm6, %xmm9 +; SSE2-NEXT: movdqa %xmm7, %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: movdqa %xmm11, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm12 +; SSE2-NEXT: pand %xmm12, %xmm7 +; SSE2-NEXT: pandn %xmm10, %xmm12 +; SSE2-NEXT: por %xmm7, %xmm12 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: movdqa %xmm11, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm13 +; SSE2-NEXT: pand %xmm13, %xmm4 +; SSE2-NEXT: pandn %xmm10, %xmm13 +; SSE2-NEXT: por %xmm4, %xmm13 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm11, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm14 +; SSE2-NEXT: pand %xmm14, %xmm5 +; SSE2-NEXT: pandn %xmm10, %xmm14 +; SSE2-NEXT: por %xmm5, %xmm14 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm11, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm10, %xmm5 +; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm11, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm10, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm11, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm10, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm11, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm10, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm14, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm14, %xmm2 +; SSE2-NEXT: movdqa %xmm13, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm13, %xmm1 +; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm12, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm12, %xmm3 +; SSE2-NEXT: movdqa %xmm9, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm9, %xmm4 +; SSE2-NEXT: packuswb %xmm3, %xmm4 +; SSE2-NEXT: packuswb %xmm4, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_packus_v16i64_v16i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm6, %xmm9 +; SSSE3-NEXT: pxor %xmm8, %xmm9 +; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [2147483903,2147483903] +; SSSE3-NEXT: movdqa %xmm11, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm9[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm14, %xmm9 +; SSSE3-NEXT: pand %xmm9, %xmm6 +; SSSE3-NEXT: pandn %xmm10, %xmm9 +; SSSE3-NEXT: por %xmm6, %xmm9 +; SSSE3-NEXT: movdqa %xmm7, %xmm6 +; SSSE3-NEXT: pxor %xmm8, %xmm6 +; SSSE3-NEXT: movdqa %xmm11, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm12 +; SSSE3-NEXT: pand %xmm12, %xmm7 +; SSSE3-NEXT: pandn %xmm10, %xmm12 +; SSSE3-NEXT: por %xmm7, %xmm12 +; SSSE3-NEXT: movdqa %xmm4, %xmm6 +; SSSE3-NEXT: pxor %xmm8, %xmm6 +; SSSE3-NEXT: movdqa %xmm11, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm13 +; SSSE3-NEXT: pand %xmm13, %xmm4 +; SSSE3-NEXT: pandn %xmm10, %xmm13 +; SSSE3-NEXT: por %xmm4, %xmm13 +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: movdqa %xmm11, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm14 +; SSSE3-NEXT: pand %xmm14, %xmm5 +; SSSE3-NEXT: pandn %xmm10, %xmm14 +; SSSE3-NEXT: por %xmm5, %xmm14 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: movdqa %xmm11, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pandn %xmm10, %xmm5 +; SSSE3-NEXT: por %xmm2, %xmm5 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm8, %xmm2 +; SSSE3-NEXT: movdqa %xmm11, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pandn %xmm10, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm8, %xmm3 +; SSSE3-NEXT: movdqa %xmm11, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: pandn %xmm10, %xmm3 +; SSSE3-NEXT: por %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm11, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm10, %xmm4 +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm1 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: packuswb %xmm3, %xmm2 +; SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm14, %xmm1 +; SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm14, %xmm2 +; SSSE3-NEXT: movdqa %xmm13, %xmm1 +; SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: pand %xmm13, %xmm1 +; SSSE3-NEXT: packuswb %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm12, %xmm2 +; SSSE3-NEXT: pxor %xmm8, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: pand %xmm12, %xmm3 +; SSSE3-NEXT: movdqa %xmm9, %xmm2 +; SSSE3-NEXT: pxor %xmm8, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm4 +; SSSE3-NEXT: pand %xmm9, %xmm4 +; SSSE3-NEXT: packuswb %xmm3, %xmm4 +; SSSE3-NEXT: packuswb %xmm4, %xmm1 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_packus_v16i64_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: movapd {{.*#+}} xmm11 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm12 = [2147483903,2147483903] +; SSE41-NEXT: movdqa %xmm12, %xmm10 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 +; SSE41-NEXT: movdqa %xmm12, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm10 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm10 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm13 +; SSE41-NEXT: movdqa %xmm12, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm13, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm13 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm13 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm12, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm14 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm14 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm12, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm15 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm15 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm12, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm12, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm12, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm12, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm11 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: movapd %xmm11, %xmm1 +; SSE41-NEXT: xorpd %xmm9, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm7 +; SSE41-NEXT: movapd %xmm3, %xmm1 +; SSE41-NEXT: xorpd %xmm9, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: packusdw %xmm7, %xmm1 +; SSE41-NEXT: movapd %xmm6, %xmm3 +; SSE41-NEXT: xorpd %xmm9, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3 +; SSE41-NEXT: movapd %xmm5, %xmm4 +; SSE41-NEXT: xorpd %xmm9, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 +; SSE41-NEXT: packusdw %xmm3, %xmm4 +; SSE41-NEXT: packusdw %xmm4, %xmm1 +; SSE41-NEXT: movapd %xmm15, %xmm3 +; SSE41-NEXT: xorpd %xmm9, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm15, %xmm4 +; SSE41-NEXT: movapd %xmm14, %xmm3 +; SSE41-NEXT: xorpd %xmm9, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm14, %xmm3 +; SSE41-NEXT: packusdw %xmm4, %xmm3 +; SSE41-NEXT: movapd %xmm13, %xmm4 +; SSE41-NEXT: xorpd %xmm9, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm4 +; SSE41-NEXT: movapd %xmm10, %xmm5 +; SSE41-NEXT: xorpd %xmm9, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm2 +; SSE41-NEXT: packusdw %xmm4, %xmm2 +; SSE41-NEXT: packusdw %xmm2, %xmm3 +; SSE41-NEXT: packuswb %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm5, %xmm6 +; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm5, %xmm10 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm6 +; AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm5, %xmm11 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm5, %xmm6 +; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm6 +; AVX1-NEXT: vblendvpd %xmm6, %xmm7, %xmm5, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm7 +; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm9, %xmm5, %xmm7 +; AVX1-NEXT: vblendvpd %xmm7, %xmm9, %xmm5, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm0 +; AVX1-NEXT: vblendvpd %xmm0, %xmm3, %xmm5, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm5, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm8, %xmm5, %xmm3 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm3 +; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm4 +; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm2 +; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm11, %xmm2 +; AVX1-NEXT: vpand %xmm11, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm10, %xmm3 +; AVX1-NEXT: vpand %xmm10, %xmm3, %xmm3 +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v16i64_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255] +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vpand %ymm1, %ymm5, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5 +; AVX2-NEXT: vpand %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm1 +; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm3 +; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_packus_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255] +; AVX512-NEXT: vpminsq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpminsq %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp slt <16 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> + %2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> + %3 = icmp sgt <16 x i64> %2, zeroinitializer + %4 = select <16 x i1> %3, <16 x i64> %2, <16 x i64> zeroinitializer + %5 = trunc <16 x i64> %4 to <16 x i8> + ret <16 x i8> %5 +} + +define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) { +; SSE-LABEL: trunc_packus_v8i32_v8i8: +; SSE: # %bb.0: +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v8i32_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v8i32_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_packus_v8i32_v8i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_packus_v8i32_v8i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_packus_v8i32_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp slt <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %3 = icmp sgt <8 x i32> %2, zeroinitializer + %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer + %5 = trunc <8 x i32> %4 to <8 x i8> + ret <8 x i8> %5 +} + +define void @trunc_packus_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) { +; SSE-LABEL: trunc_packus_v8i32_v8i8_store: +; SSE: # %bb.0: +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movq %xmm0, (%rdi) +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v8i32_v8i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v8i32_v8i8_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_packus_v8i32_v8i8_store: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, (%rdi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_packus_v8i32_v8i8_store: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovusdb %ymm0, (%rdi) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_packus_v8i32_v8i8_store: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rdi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i8_store: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovusdb %ymm0, (%rdi) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp slt <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %3 = icmp sgt <8 x i32> %2, zeroinitializer + %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer + %5 = trunc <8 x i32> %4 to <8 x i8> + store <8 x i8> %5, <8 x i8> *%p1 + ret void +} + +define <16 x i8> @trunc_packus_v16i32_v16i8(<16 x i32> %a0) { +; SSE-LABEL: trunc_packus_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_packus_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovusdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp slt <16 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %3 = icmp sgt <16 x i32> %2, zeroinitializer + %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer + %5 = trunc <16 x i32> %4 to <16 x i8> + ret <16 x i8> %5 +} + +define <16 x i8> @trunc_packus_v16i16_v16i8(<16 x i16> %a0) { +; SSE-LABEL: trunc_packus_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_packus_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_packus_v16i16_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_packus_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_packus_v16i16_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovuswb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp slt <16 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %2 = select <16 x i1> %1, <16 x i16> %a0, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %3 = icmp sgt <16 x i16> %2, zeroinitializer + %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer + %5 = trunc <16 x i16> %4 to <16 x i8> + ret <16 x i8> %5 +} + +define <32 x i8> @trunc_packus_v32i16_v32i8(<32 x i16> %a0) { +; SSE-LABEL: trunc_packus_v32i16_v32i8: +; SSE: # %bb.0: +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v32i16_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v32i16_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_packus_v32i16_v32i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpminsw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_packus_v32i16_v32i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpminsw %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpminsw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_packus_v32i16_v32i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_packus_v32i16_v32i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovuswb %zmm0, %ymm0 +; AVX512BWVL-NEXT: retq + %1 = icmp slt <32 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %2 = select <32 x i1> %1, <32 x i16> %a0, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %3 = icmp sgt <32 x i16> %2, zeroinitializer + %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer + %5 = trunc <32 x i16> %4 to <32 x i8> + ret <32 x i8> %5 +} diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat-widen.ll new file mode 100644 index 00000000000..4d32267b61e --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat-widen.ll @@ -0,0 +1,3050 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL + +; +; Signed saturation truncation to vXi32 +; + +define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) { +; SSE2-LABEL: trunc_ssat_v4i64_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [18446744069414584320,18446744069414584320] +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: pandn %xmm1, %xmm6 +; SSE2-NEXT: por %xmm4, %xmm6 +; SSE2-NEXT: pxor %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_ssat_v4i64_v4i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647] +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [18446744069414584320,18446744069414584320] +; SSSE3-NEXT: movdqa %xmm0, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm4 +; SSSE3-NEXT: pandn %xmm1, %xmm6 +; SSSE3-NEXT: por %xmm4, %xmm6 +; SSSE3-NEXT: pxor %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm3 +; SSSE3-NEXT: pandn %xmm1, %xmm0 +; SSSE3-NEXT: por %xmm3, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_ssat_v4i64_v4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movapd {{.*#+}} xmm4 = [2147483647,2147483647] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295] +; SSE41-NEXT: movdqa %xmm6, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm4, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; SSE41-NEXT: movapd %xmm4, %xmm2 +; SSE41-NEXT: xorpd %xmm3, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [18446744069414584320,18446744069414584320] +; SSE41-NEXT: movapd %xmm2, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: xorpd %xmm5, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2147483647,2147483647] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968] +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_ssat_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] +; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] +; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_ssat_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] +; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] +; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc_ssat_v4i64_v4i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] +; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] +; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_ssat_v4i64_v4i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsqd %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_ssat_v4i64_v4i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] +; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] +; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_ssat_v4i64_v4i32: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovsqd %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp slt <4 x i64> %a0, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647> + %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647> + %3 = icmp sgt <4 x i64> %2, <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648> + %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648> + %5 = trunc <4 x i64> %4 to <4 x i32> + ret <4 x i32> %5 +} + + +define <8 x i32> @trunc_ssat_v8i64_v8i32(<8 x i64> %a0) { +; SSE2-LABEL: trunc_ssat_v8i64_v8i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm7 +; SSE2-NEXT: por %xmm2, %xmm7 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm6 +; SSE2-NEXT: por %xmm3, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067968,18446744071562067968] +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744069414584320,18446744069414584320] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm6 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm7 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm7, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_ssat_v8i64_v8i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [4294967295,4294967295] +; SSSE3-NEXT: movdqa %xmm9, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm0, %xmm5 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm0 +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm7 +; SSSE3-NEXT: por %xmm2, %xmm7 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm6 +; SSSE3-NEXT: por %xmm3, %xmm6 +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067968,18446744071562067968] +; SSSE3-NEXT: movdqa %xmm6, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [18446744069414584320,18446744069414584320] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm6 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm6, %xmm2 +; SSSE3-NEXT: movdqa %xmm7, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm7 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm7, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm5, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm5 +; SSSE3-NEXT: pandn %xmm8, %xmm0 +; SSSE3-NEXT: por %xmm5, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_ssat_v8i64_v8i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm9 +; SSE41-NEXT: movapd {{.*#+}} xmm10 = [2147483647,2147483647] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] +; SSE41-NEXT: movdqa %xmm4, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm10, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm8 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm4, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm10, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm10, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm10 +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] +; SSE41-NEXT: movapd %xmm10, %xmm1 +; SSE41-NEXT: xorpd %xmm5, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [18446744069414584320,18446744069414584320] +; SSE41-NEXT: movapd %xmm1, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm4 +; SSE41-NEXT: movapd %xmm6, %xmm1 +; SSE41-NEXT: xorpd %xmm5, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] +; SSE41-NEXT: movapd %xmm9, %xmm4 +; SSE41-NEXT: xorpd %xmm5, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm4 +; SSE41-NEXT: xorpd %xmm8, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v8i64_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483647,2147483647] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968] +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm9 +; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7 +; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 +; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm4, %xmm2 +; AVX1-NEXT: vblendvpd %xmm9, %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_ssat_v8i64_v8i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647] +; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] +; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_ssat_v8i64_v8i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647] +; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] +; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_ssat_v8i64_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsqd %zmm0, %ymm0 +; AVX512-NEXT: retq + %1 = icmp slt <8 x i64> %a0, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647> + %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647> + %3 = icmp sgt <8 x i64> %2, <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648> + %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648> + %5 = trunc <8 x i64> %4 to <8 x i32> + ret <8 x i32> %5 +} + +; +; Signed saturation truncation to vXi16 +; + +define <8 x i16> @trunc_ssat_v8i64_v8i16(<8 x i64> %a0) { +; SSE2-LABEL: trunc_ssat_v8i64_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147516415,2147516415] +; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm7 +; SSE2-NEXT: por %xmm1, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709518848,18446744073709518848] +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562035200,18446744071562035200] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm7 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm7, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm5 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: packssdw %xmm3, %xmm1 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_ssat_v8i64_v8i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147516415,2147516415] +; SSSE3-NEXT: movdqa %xmm9, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm2, %xmm5 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm7 +; SSSE3-NEXT: por %xmm1, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709518848,18446744073709518848] +; SSSE3-NEXT: movdqa %xmm7, %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562035200,18446744071562035200] +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm7 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm7, %xmm1 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm0 +; SSSE3-NEXT: por %xmm3, %xmm0 +; SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: pxor %xmm5, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm5 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: packssdw %xmm3, %xmm1 +; SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_ssat_v8i64_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm10 +; SSE41-NEXT: movapd {{.*#+}} xmm11 = [32767,32767] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147516415,2147516415] +; SSE41-NEXT: movdqa %xmm4, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm9 +; SSE41-NEXT: movdqa %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm2 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm11 +; SSE41-NEXT: movapd {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] +; SSE41-NEXT: movapd %xmm11, %xmm1 +; SSE41-NEXT: xorpd %xmm5, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562035200,18446744071562035200] +; SSE41-NEXT: movapd %xmm1, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm6 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: xorpd %xmm5, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: packssdw %xmm6, %xmm1 +; SSE41-NEXT: movapd %xmm9, %xmm2 +; SSE41-NEXT: xorpd %xmm5, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm2 +; SSE41-NEXT: xorpd %xmm8, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 +; SSE41-NEXT: packssdw %xmm2, %xmm3 +; SSE41-NEXT: packssdw %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32767,32767] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm9 +; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7 +; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 +; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm4, %xmm2 +; AVX1-NEXT: vblendvpd %xmm9, %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v8i64_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [32767,32767,32767,32767] +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848] +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_ssat_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp slt <8 x i64> %a0, <i64 32767, i64 32767, i64 32767, i64 32767, i64 32767, i64 32767, i64 32767, i64 32767> + %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 32767, i64 32767, i64 32767, i64 32767, i64 32767, i64 32767, i64 32767, i64 32767> + %3 = icmp sgt <8 x i64> %2, <i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768> + %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> <i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768> + %5 = trunc <8 x i64> %4 to <8 x i16> + ret <8 x i16> %5 +} + +define <8 x i16> @trunc_ssat_v8i32_v8i16(<8 x i32> %a0) { +; SSE-LABEL: trunc_ssat_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_ssat_v8i32_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528] +; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_ssat_v8i32_v8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_ssat_v8i32_v8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528] +; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_ssat_v8i32_v8i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovsdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp slt <8 x i32> %a0, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %3 = icmp sgt <8 x i32> %2, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %5 = trunc <8 x i32> %4 to <8 x i16> + ret <8 x i16> %5 +} + +define <16 x i16> @trunc_ssat_v16i32_v16i16(<16 x i32> %a0) { +; SSE-LABEL: trunc_ssat_v16i32_v16i16: +; SSE: # %bb.0: +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v16i32_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v16i32_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_ssat_v16i32_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsdw %zmm0, %ymm0 +; AVX512-NEXT: retq + %1 = icmp slt <16 x i32> %a0, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %3 = icmp sgt <16 x i32> %2, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %5 = trunc <16 x i32> %4 to <16 x i16> + ret <16 x i16> %5 +} + +; +; Signed saturation truncation to v16i8 +; + +define <8 x i8> @trunc_ssat_v8i64_v8i8(<8 x i64> %a0) { +; SSE2-LABEL: trunc_ssat_v8i64_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: por %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm7 +; SSE2-NEXT: por %xmm0, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm7 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm7 +; SSE2-NEXT: por %xmm2, %xmm7 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm5 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: packuswb %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm7 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm7, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_ssat_v8i64_v8i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [127,127] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775] +; SSSE3-NEXT: movdqa %xmm9, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm3, %xmm5 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm7 +; SSSE3-NEXT: por %xmm0, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] +; SSSE3-NEXT: movdqa %xmm7, %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm7 +; SSSE3-NEXT: pandn %xmm8, %xmm0 +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm7 +; SSSE3-NEXT: por %xmm2, %xmm7 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm3, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm5 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: packuswb %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm3, %xmm7 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: packuswb %xmm7, %xmm0 +; SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSSE3-NEXT: packuswb %xmm0, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_ssat_v8i64_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: movapd {{.*#+}} xmm11 = [127,127] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483775,2147483775] +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm6, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm9 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm6, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm10 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm10 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm6, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm11 +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488] +; SSE41-NEXT: movapd %xmm11, %xmm1 +; SSE41-NEXT: xorpd %xmm5, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562067840,18446744071562067840] +; SSE41-NEXT: movapd %xmm1, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm1 +; SSE41-NEXT: movapd %xmm3, %xmm6 +; SSE41-NEXT: xorpd %xmm5, %xmm6 +; SSE41-NEXT: movapd %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 +; SSE41-NEXT: movapd %xmm10, %xmm3 +; SSE41-NEXT: xorpd %xmm5, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3 +; SSE41-NEXT: xorpd %xmm9, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm2 +; SSE41-NEXT: movapd {{.*#+}} xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE41-NEXT: andpd %xmm0, %xmm2 +; SSE41-NEXT: andpd %xmm0, %xmm3 +; SSE41-NEXT: packusdw %xmm2, %xmm3 +; SSE41-NEXT: andpd %xmm0, %xmm7 +; SSE41-NEXT: andpd %xmm0, %xmm1 +; SSE41-NEXT: packusdw %xmm7, %xmm1 +; SSE41-NEXT: packusdw %xmm3, %xmm1 +; SSE41-NEXT: packuswb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v8i64_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} ymm8 = [127,127,127,127] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7 +; AVX1-NEXT: vblendvpd %ymm7, %ymm1, %ymm8, %ymm9 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm10 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm11 +; AVX1-NEXT: vblendvpd %ymm11, %ymm0, %ymm8, %ymm8 +; AVX1-NEXT: vmovapd {{.*#+}} ymm11 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] +; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vblendvpd %xmm10, %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vblendvpd %ymm0, %ymm8, %ymm11, %ymm0 +; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vblendvpd %ymm1, %ymm9, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [255,255,255,255] +; AVX1-NEXT: vandpd %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v8i64_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [127,127,127,127] +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_ssat_v8i64_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp slt <8 x i64> %a0, <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127> + %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127> + %3 = icmp sgt <8 x i64> %2, <i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128> + %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> <i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128> + %5 = trunc <8 x i64> %4 to <8 x i8> + ret <8 x i8> %5 +} + +; TODO: The AVX1 codegen shows a missed opportunity to narrow blendv+logic to 128-bit. + +define void @trunc_ssat_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) { +; SSE2-LABEL: trunc_ssat_v8i64_v8i8_store: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: por %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm7 +; SSE2-NEXT: por %xmm0, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm7 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm7 +; SSE2-NEXT: por %xmm2, %xmm7 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm5 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: packuswb %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm7 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm7, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movq %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_ssat_v8i64_v8i8_store: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [127,127] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775] +; SSSE3-NEXT: movdqa %xmm9, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm3, %xmm5 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm7 +; SSSE3-NEXT: por %xmm0, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] +; SSSE3-NEXT: movdqa %xmm7, %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm7 +; SSSE3-NEXT: pandn %xmm8, %xmm0 +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm7 +; SSSE3-NEXT: por %xmm2, %xmm7 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm3, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm5 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: packuswb %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm3, %xmm7 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: packuswb %xmm7, %xmm0 +; SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSSE3-NEXT: packuswb %xmm0, %xmm0 +; SSSE3-NEXT: movq %xmm0, (%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_ssat_v8i64_v8i8_store: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: movapd {{.*#+}} xmm11 = [127,127] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483775,2147483775] +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm6, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm9 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm6, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm10 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm10 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm6, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm11 +; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; SSE41-NEXT: movapd %xmm11, %xmm2 +; SSE41-NEXT: xorpd %xmm5, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562067840,18446744071562067840] +; SSE41-NEXT: movapd %xmm2, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm2 +; SSE41-NEXT: movapd %xmm3, %xmm6 +; SSE41-NEXT: xorpd %xmm5, %xmm6 +; SSE41-NEXT: movapd %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 +; SSE41-NEXT: movapd %xmm10, %xmm3 +; SSE41-NEXT: xorpd %xmm5, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3 +; SSE41-NEXT: xorpd %xmm9, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1 +; SSE41-NEXT: movapd {{.*#+}} xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE41-NEXT: andpd %xmm0, %xmm1 +; SSE41-NEXT: andpd %xmm0, %xmm3 +; SSE41-NEXT: packusdw %xmm1, %xmm3 +; SSE41-NEXT: andpd %xmm0, %xmm7 +; SSE41-NEXT: andpd %xmm0, %xmm2 +; SSE41-NEXT: packusdw %xmm7, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: packuswb %xmm0, %xmm2 +; SSE41-NEXT: movq %xmm2, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v8i64_v8i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} ymm8 = [127,127,127,127] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7 +; AVX1-NEXT: vblendvpd %ymm7, %ymm1, %ymm8, %ymm9 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm10 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm11 +; AVX1-NEXT: vblendvpd %ymm11, %ymm0, %ymm8, %ymm8 +; AVX1-NEXT: vmovapd {{.*#+}} ymm11 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] +; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vblendvpd %xmm10, %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vblendvpd %ymm0, %ymm8, %ymm11, %ymm0 +; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vblendvpd %ymm1, %ymm9, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [255,255,255,255] +; AVX1-NEXT: vandpd %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v8i64_v8i8_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [127,127,127,127] +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: vmovq %xmm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_ssat_v8i64_v8i8_store: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsqb %zmm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp slt <8 x i64> %a0, <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127> + %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127> + %3 = icmp sgt <8 x i64> %2, <i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128> + %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> <i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128> + %5 = trunc <8 x i64> %4 to <8 x i8> + store <8 x i8> %5, <8 x i8> *%p1 + ret void +} + +define <16 x i8> @trunc_ssat_v16i64_v16i8(<16 x i64> %a0) { +; SSE2-LABEL: trunc_ssat_v16i64_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [127,127] +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm6, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm11, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm14, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm6 +; SSE2-NEXT: pandn %xmm10, %xmm9 +; SSE2-NEXT: por %xmm6, %xmm9 +; SSE2-NEXT: movdqa %xmm7, %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: movdqa %xmm11, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm12 +; SSE2-NEXT: pand %xmm12, %xmm7 +; SSE2-NEXT: pandn %xmm10, %xmm12 +; SSE2-NEXT: por %xmm7, %xmm12 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: movdqa %xmm11, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm13 +; SSE2-NEXT: pand %xmm13, %xmm4 +; SSE2-NEXT: pandn %xmm10, %xmm13 +; SSE2-NEXT: por %xmm4, %xmm13 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm11, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm14 +; SSE2-NEXT: pand %xmm14, %xmm5 +; SSE2-NEXT: pandn %xmm10, %xmm14 +; SSE2-NEXT: por %xmm5, %xmm14 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm11, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm10, %xmm5 +; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm11, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pandn %xmm10, %xmm6 +; SSE2-NEXT: por %xmm3, %xmm6 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm11, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm10, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm11, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm10, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744073709551488,18446744073709551488] +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pandn %xmm10, %xmm1 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm10, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm6 +; SSE2-NEXT: pandn %xmm10, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm5 +; SSE2-NEXT: pandn %xmm10, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm3 +; SSE2-NEXT: packssdw %xmm2, %xmm3 +; SSE2-NEXT: packssdw %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm14, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm14 +; SSE2-NEXT: pandn %xmm10, %xmm2 +; SSE2-NEXT: por %xmm14, %xmm2 +; SSE2-NEXT: movdqa %xmm13, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm13 +; SSE2-NEXT: pandn %xmm10, %xmm3 +; SSE2-NEXT: por %xmm13, %xmm3 +; SSE2-NEXT: packssdw %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm12, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm12 +; SSE2-NEXT: pandn %xmm10, %xmm2 +; SSE2-NEXT: por %xmm12, %xmm2 +; SSE2-NEXT: pxor %xmm9, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm9 +; SSE2-NEXT: pandn %xmm10, %xmm1 +; SSE2-NEXT: por %xmm9, %xmm1 +; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: packssdw %xmm1, %xmm3 +; SSE2-NEXT: packsswb %xmm3, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_ssat_v16i64_v16i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [127,127] +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm6, %xmm9 +; SSSE3-NEXT: pxor %xmm8, %xmm9 +; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [2147483775,2147483775] +; SSSE3-NEXT: movdqa %xmm11, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm9[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm14, %xmm9 +; SSSE3-NEXT: pand %xmm9, %xmm6 +; SSSE3-NEXT: pandn %xmm10, %xmm9 +; SSSE3-NEXT: por %xmm6, %xmm9 +; SSSE3-NEXT: movdqa %xmm7, %xmm6 +; SSSE3-NEXT: pxor %xmm8, %xmm6 +; SSSE3-NEXT: movdqa %xmm11, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm12 +; SSSE3-NEXT: pand %xmm12, %xmm7 +; SSSE3-NEXT: pandn %xmm10, %xmm12 +; SSSE3-NEXT: por %xmm7, %xmm12 +; SSSE3-NEXT: movdqa %xmm4, %xmm6 +; SSSE3-NEXT: pxor %xmm8, %xmm6 +; SSSE3-NEXT: movdqa %xmm11, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm13 +; SSSE3-NEXT: pand %xmm13, %xmm4 +; SSSE3-NEXT: pandn %xmm10, %xmm13 +; SSSE3-NEXT: por %xmm4, %xmm13 +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: movdqa %xmm11, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm14 +; SSSE3-NEXT: pand %xmm14, %xmm5 +; SSSE3-NEXT: pandn %xmm10, %xmm14 +; SSSE3-NEXT: por %xmm5, %xmm14 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: movdqa %xmm11, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pandn %xmm10, %xmm5 +; SSSE3-NEXT: por %xmm2, %xmm5 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm8, %xmm2 +; SSSE3-NEXT: movdqa %xmm11, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm3 +; SSSE3-NEXT: pandn %xmm10, %xmm6 +; SSSE3-NEXT: por %xmm3, %xmm6 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pxor %xmm8, %xmm2 +; SSSE3-NEXT: movdqa %xmm11, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: pandn %xmm10, %xmm3 +; SSSE3-NEXT: por %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm11, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm10, %xmm4 +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [18446744073709551488,18446744073709551488] +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm8, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [18446744071562067840,18446744071562067840] +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm4 +; SSSE3-NEXT: pandn %xmm10, %xmm1 +; SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm3 +; SSSE3-NEXT: pandn %xmm10, %xmm0 +; SSSE3-NEXT: por %xmm3, %xmm0 +; SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm6, %xmm1 +; SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm6 +; SSSE3-NEXT: pandn %xmm10, %xmm2 +; SSSE3-NEXT: por %xmm6, %xmm2 +; SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm5 +; SSSE3-NEXT: pandn %xmm10, %xmm3 +; SSSE3-NEXT: por %xmm5, %xmm3 +; SSSE3-NEXT: packssdw %xmm2, %xmm3 +; SSSE3-NEXT: packssdw %xmm3, %xmm0 +; SSSE3-NEXT: movdqa %xmm14, %xmm1 +; SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm14 +; SSSE3-NEXT: pandn %xmm10, %xmm2 +; SSSE3-NEXT: por %xmm14, %xmm2 +; SSSE3-NEXT: movdqa %xmm13, %xmm1 +; SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm13 +; SSSE3-NEXT: pandn %xmm10, %xmm3 +; SSSE3-NEXT: por %xmm13, %xmm3 +; SSSE3-NEXT: packssdw %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm12, %xmm1 +; SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm12 +; SSSE3-NEXT: pandn %xmm10, %xmm2 +; SSSE3-NEXT: por %xmm12, %xmm2 +; SSSE3-NEXT: pxor %xmm9, %xmm8 +; SSSE3-NEXT: movdqa %xmm8, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm9 +; SSSE3-NEXT: pandn %xmm10, %xmm1 +; SSSE3-NEXT: por %xmm9, %xmm1 +; SSSE3-NEXT: packssdw %xmm2, %xmm1 +; SSSE3-NEXT: packssdw %xmm1, %xmm3 +; SSSE3-NEXT: packsswb %xmm3, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_ssat_v16i64_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: movapd {{.*#+}} xmm11 = [127,127] +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm12 = [2147483775,2147483775] +; SSE41-NEXT: movdqa %xmm12, %xmm10 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 +; SSE41-NEXT: movdqa %xmm12, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm10 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm10 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm13 +; SSE41-NEXT: movdqa %xmm12, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm13, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm13 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm13 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm12, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm14 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm14 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm12, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm15 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm15 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm12, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm12, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm12, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm12, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm11 +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488] +; SSE41-NEXT: movapd %xmm11, %xmm1 +; SSE41-NEXT: xorpd %xmm9, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067840,18446744071562067840] +; SSE41-NEXT: movapd %xmm1, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm4 +; SSE41-NEXT: movapd %xmm7, %xmm1 +; SSE41-NEXT: xorpd %xmm9, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: packssdw %xmm4, %xmm1 +; SSE41-NEXT: movapd %xmm6, %xmm3 +; SSE41-NEXT: xorpd %xmm9, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3 +; SSE41-NEXT: movapd %xmm5, %xmm4 +; SSE41-NEXT: xorpd %xmm9, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 +; SSE41-NEXT: packssdw %xmm3, %xmm4 +; SSE41-NEXT: packssdw %xmm4, %xmm1 +; SSE41-NEXT: movapd %xmm15, %xmm3 +; SSE41-NEXT: xorpd %xmm9, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm15, %xmm3 +; SSE41-NEXT: movapd %xmm14, %xmm4 +; SSE41-NEXT: xorpd %xmm9, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm14, %xmm4 +; SSE41-NEXT: packssdw %xmm3, %xmm4 +; SSE41-NEXT: movapd %xmm13, %xmm3 +; SSE41-NEXT: xorpd %xmm9, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm3 +; SSE41-NEXT: xorpd %xmm10, %xmm9 +; SSE41-NEXT: movapd %xmm9, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm2 +; SSE41-NEXT: packssdw %xmm3, %xmm2 +; SSE41-NEXT: packssdw %xmm2, %xmm4 +; SSE41-NEXT: packsswb %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [127,127] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm5, %xmm6 +; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm5, %xmm10 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm6 +; AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm5, %xmm11 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm5, %xmm6 +; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm6 +; AVX1-NEXT: vblendvpd %xmm6, %xmm7, %xmm5, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm7 +; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm9, %xmm5, %xmm7 +; AVX1-NEXT: vblendvpd %xmm7, %xmm9, %xmm5, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm0 +; AVX1-NEXT: vblendvpd %xmm0, %xmm3, %xmm5, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm5, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm8, %xmm5, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm3, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm5, %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm0, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm7, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm7 +; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm7 +; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm5, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm7 +; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm11, %xmm7 +; AVX1-NEXT: vblendvpd %xmm7, %xmm11, %xmm5, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm10, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm10, %xmm5, %xmm3 +; AVX1-NEXT: vpackssdw %xmm8, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm7, %xmm3, %xmm2 +; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v16i64_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [127,127,127,127] +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm1 +; AVX2-NEXT: vblendvpd %ymm1, %ymm3, %ymm4, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_ssat_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [127,127,127,127,127,127,127,127] +; AVX512-NEXT: vpminsq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpminsq %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] +; AVX512-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp slt <16 x i64> %a0, <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127> + %2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127> + %3 = icmp sgt <16 x i64> %2, <i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128> + %4 = select <16 x i1> %3, <16 x i64> %2, <16 x i64> <i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128> + %5 = trunc <16 x i64> %4 to <16 x i8> + ret <16 x i8> %5 +} + +define <8 x i8> @trunc_ssat_v8i32_v8i8(<8 x i32> %a0) { +; SSE-LABEL: trunc_ssat_v8i32_v8i8: +; SSE: # %bb.0: +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v8i32_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v8i32_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_ssat_v8i32_v8i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] +; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_ssat_v8i32_v8i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmaxsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_ssat_v8i32_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] +; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] +; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_ssat_v8i32_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmaxsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp slt <8 x i32> %a0, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127> + %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127> + %3 = icmp sgt <8 x i32> %2, <i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128> + %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> <i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128> + %5 = trunc <8 x i32> %4 to <8 x i8> + ret <8 x i8> %5 +} + +define void @trunc_ssat_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) { +; SSE-LABEL: trunc_ssat_v8i32_v8i8_store: +; SSE: # %bb.0: +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: movq %xmm0, (%rdi) +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v8i32_v8i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v8i32_v8i8_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_ssat_v8i32_v8i8_store: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] +; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, (%rdi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_ssat_v8i32_v8i8_store: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsdb %ymm0, (%rdi) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_ssat_v8i32_v8i8_store: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] +; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] +; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rdi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_ssat_v8i32_v8i8_store: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovsdb %ymm0, (%rdi) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp slt <8 x i32> %a0, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127> + %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127> + %3 = icmp sgt <8 x i32> %2, <i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128> + %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> <i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128> + %5 = trunc <8 x i32> %4 to <8 x i8> + store <8 x i8> %5, <8 x i8> *%p1 + ret void +} + +define <16 x i8> @trunc_ssat_v16i32_v16i8(<16 x i32> %a0) { +; SSE-LABEL: trunc_ssat_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_ssat_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp slt <16 x i32> %a0, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127> + %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127> + %3 = icmp sgt <16 x i32> %2, <i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128> + %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> <i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128> + %5 = trunc <16 x i32> %4 to <16 x i8> + ret <16 x i8> %5 +} + +define <16 x i8> @trunc_ssat_v16i16_v16i8(<16 x i16> %a0) { +; SSE-LABEL: trunc_ssat_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_ssat_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_ssat_v16i16_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_ssat_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_ssat_v16i16_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovswb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp slt <16 x i16> %a0, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> + %2 = select <16 x i1> %1, <16 x i16> %a0, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> + %3 = icmp sgt <16 x i16> %2, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> + %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> + %5 = trunc <16 x i16> %4 to <16 x i8> + ret <16 x i8> %5 +} + +define <32 x i8> @trunc_ssat_v32i16_v32i8(<32 x i16> %a0) { +; SSE-LABEL: trunc_ssat_v32i16_v32i8: +; SSE: # %bb.0: +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v32i16_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v32i16_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_ssat_v32i16_v32i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpminsw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408] +; AVX512F-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_ssat_v32i16_v32i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-NEXT: vpminsw %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpminsw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408] +; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_ssat_v32i16_v32i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovswb %zmm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_ssat_v32i16_v32i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovswb %zmm0, %ymm0 +; AVX512BWVL-NEXT: retq + %1 = icmp slt <32 x i16> %a0, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> + %2 = select <32 x i1> %1, <32 x i16> %a0, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> + %3 = icmp sgt <32 x i16> %2, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> + %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> + %5 = trunc <32 x i16> %4 to <32 x i8> + ret <32 x i8> %5 +} diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-usat-widen.ll new file mode 100644 index 00000000000..fd76cb53c6e --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-trunc-usat-widen.ll @@ -0,0 +1,2430 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL + +; +; Unsigned saturation truncation to vXi32 +; + +define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) { +; SSE2-LABEL: trunc_usat_v4i64_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pandn {{.*}}(%rip), %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn {{.*}}(%rip), %xmm5 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v4i64_v4i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm3, %xmm5 +; SSSE3-NEXT: pxor %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm4, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm3 +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm5 +; SSSE3-NEXT: por %xmm5, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v4i64_v4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm3 +; SSE41-NEXT: por %xmm6, %xmm3 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295] +; SSE41-NEXT: movapd {{.*#+}} xmm5 = [4294967295,429496729] +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2] +; SSE41-NEXT: movaps %xmm4, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpxor %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [4294967295,429496729] +; AVX1-NEXT: vblendvpd %xmm1, %xmm4, %xmm3, %xmm1 +; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [4294967295,4294967295] +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_usat_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-SLOW-NEXT: vpxor %ymm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] +; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729] +; AVX2-SLOW-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_usat_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-FAST-NEXT: vpxor %ymm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] +; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729] +; AVX2-FAST-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc_usat_v4i64_v4i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] +; AVX512F-NEXT: vpcmpltuq %zmm1, %zmm0, %k1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vpmovqd %zmm1, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_usat_v4i64_v4i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpltuq {{.*}}(%rip){1to4}, %ymm0, %k1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729] +; AVX512VL-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} +; AVX512VL-NEXT: vpmovqd %ymm1, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_usat_v4i64_v4i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] +; AVX512BW-NEXT: vpcmpltuq %zmm1, %zmm0, %k1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_usat_v4i64_v4i32: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpcmpltuq {{.*}}(%rip){1to4}, %ymm0, %k1 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729] +; AVX512BWVL-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} +; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp ult <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> + %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 429496729> + %3 = trunc <4 x i64> %2 to <4 x i32> + ret <4 x i32> %3 +} + +define <8 x i32> @trunc_usat_v8i64_v8i32(<8 x i64> %a0) { +; SSE2-LABEL: trunc_usat_v8i64_v8i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: pxor %xmm5, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455] +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm3 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm9, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm2, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; SSE2-NEXT: movaps %xmm3, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v8i64_v8i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa %xmm3, %xmm7 +; SSSE3-NEXT: pxor %xmm5, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455] +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pxor %xmm5, %xmm3 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm2 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: pxor %xmm0, %xmm5 +; SSSE3-NEXT: movdqa %xmm9, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm2, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; SSSE3-NEXT: movaps %xmm3, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v8i64_v8i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: movapd {{.*#+}} xmm9 = [4294967295,4294967295] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm4, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,2] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: pxor %xmm8, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm9 +; SSE41-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[0,2] +; SSE41-NEXT: movaps %xmm9, %xmm0 +; SSE41-NEXT: movaps %xmm3, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v8i64_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpxor %xmm2, %xmm5, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm6 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm4, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [4294967295,4294967295] +; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm4, %xmm2 +; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm4, %xmm2 +; AVX1-NEXT: vblendvpd %xmm8, %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_usat_v8i64_v8i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] +; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_usat_v8i64_v8i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-FAST-NEXT: vpxor %ymm3, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] +; AVX2-FAST-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpxor %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_usat_v8i64_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovusqd %zmm0, %ymm0 +; AVX512-NEXT: retq + %1 = icmp ult <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> + %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> + %3 = trunc <8 x i64> %2 to <8 x i32> + ret <8 x i32> %3 +} + +; +; Unsigned saturation truncation to vXi16 +; + +define <8 x i16> @trunc_usat_v8i64_v8i16(<8 x i64> %a0) { +; SSE2-LABEL: trunc_usat_v8i64_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm6, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm6, %xmm3 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm9, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v8i64_v8i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm6, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991] +; SSSE3-NEXT: movdqa %xmm9, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm2, %xmm5 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm6, %xmm2 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm6, %xmm3 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm1, %xmm6 +; SSSE3-NEXT: movdqa %xmm9, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm0 +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v8i64_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: movapd {{.*#+}} xmm9 = [65535,65535] +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002324991,9223372039002324991] +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 +; SSE41-NEXT: packusdw %xmm5, %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 +; SSE41-NEXT: pxor %xmm2, %xmm7 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm7, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9 +; SSE41-NEXT: packusdw %xmm5, %xmm9 +; SSE41-NEXT: packusdw %xmm9, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpxor %xmm2, %xmm5, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm6 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm4, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [65535,65535] +; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm4, %xmm2 +; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm4, %xmm2 +; AVX1-NEXT: vblendvpd %xmm8, %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v8i64_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm4 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343] +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_usat_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp ult <8 x i64> %a0, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535> + %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535> + %3 = trunc <8 x i64> %2 to <8 x i16> + ret <8 x i16> %3 +} + +define <8 x i16> @trunc_usat_v8i32_v8i16(<8 x i32> %a0) { +; SSE2-LABEL: trunc_usat_v8i32_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pslld $16, %xmm4 +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm4, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v8i32_v8i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm3, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pandn %xmm2, %xmm6 +; SSSE3-NEXT: por %xmm6, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pandn %xmm2, %xmm5 +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm1, %xmm5 +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v8i32_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSE41-NEXT: pminud %xmm2, %xmm1 +; SSE41-NEXT: pminud %xmm2, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_usat_v8i32_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_usat_v8i32_v8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovusdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_usat_v8i32_v8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512BW-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_usat_v8i32_v8i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovusdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp ult <8 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %3 = trunc <8 x i32> %2 to <8 x i16> + ret <8 x i16> %3 +} + +define <16 x i16> @trunc_usat_v16i32_v16i16(<16 x i32> %a0) { +; SSE2-LABEL: trunc_usat_v16i32_v16i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: pxor %xmm6, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm7 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm7, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm7, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm6, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm7, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 +; SSE2-NEXT: pxor %xmm5, %xmm7 +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: por %xmm7, %xmm5 +; SSE2-NEXT: pslld $16, %xmm5 +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm5, %xmm0 +; SSE2-NEXT: pslld $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v16i32_v16i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa %xmm1, %xmm8 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm2, %xmm7 +; SSSE3-NEXT: pxor %xmm6, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] +; SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm7 +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm7, %xmm1 +; SSSE3-NEXT: por %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm6, %xmm4 +; SSSE3-NEXT: movdqa %xmm5, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pxor %xmm7, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm6, %xmm3 +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm7, %xmm4 +; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm8, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 +; SSSE3-NEXT: pxor %xmm5, %xmm7 +; SSSE3-NEXT: pand %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm7, %xmm5 +; SSSE3-NEXT: pslld $16, %xmm5 +; SSSE3-NEXT: psrad $16, %xmm5 +; SSSE3-NEXT: pslld $16, %xmm0 +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: packssdw %xmm5, %xmm0 +; SSSE3-NEXT: pslld $16, %xmm2 +; SSSE3-NEXT: psrad $16, %xmm2 +; SSSE3-NEXT: pslld $16, %xmm1 +; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: packssdw %xmm2, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v16i32_v16i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] +; SSE41-NEXT: pminud %xmm4, %xmm3 +; SSE41-NEXT: pminud %xmm4, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pminud %xmm4, %xmm1 +; SSE41-NEXT: pminud %xmm4, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v16i32_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] +; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v16i32_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpminud %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_usat_v16i32_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovusdw %zmm0, %ymm0 +; AVX512-NEXT: retq + %1 = icmp ult <16 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %3 = trunc <16 x i32> %2 to <16 x i16> + ret <16 x i16> %3 +} + +; +; Unsigned saturation truncation to v16i8 +; + +define <8 x i8> @trunc_usat_v8i64_v8i8(<8 x i64> %a0) { +; SSE2-LABEL: trunc_usat_v8i64_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm6, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm4 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm6 +; SSE2-NEXT: movdqa %xmm9, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: packuswb %xmm4, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v8i64_v8i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm6, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711] +; SSSE3-NEXT: movdqa %xmm9, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm6, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm4 +; SSSE3-NEXT: pandn %xmm8, %xmm0 +; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: packuswb %xmm5, %xmm0 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm6, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm2, %xmm6 +; SSSE3-NEXT: movdqa %xmm9, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm3, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm2, %xmm1 +; SSSE3-NEXT: packuswb %xmm4, %xmm1 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: packuswb %xmm0, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v8i64_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: movapd {{.*#+}} xmm9 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 +; SSE41-NEXT: packusdw %xmm5, %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 +; SSE41-NEXT: pxor %xmm2, %xmm7 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm7, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9 +; SSE41-NEXT: packusdw %xmm5, %xmm9 +; SSE41-NEXT: packusdw %xmm9, %xmm1 +; SSE41-NEXT: packuswb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v8i64_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpxor %xmm2, %xmm5, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm6 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm4, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [255,255] +; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm4, %xmm2 +; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm4, %xmm2 +; AVX1-NEXT: vblendvpd %xmm8, %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v8i64_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm4 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063] +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_usat_v8i64_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp ult <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> + %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> + %3 = trunc <8 x i64> %2 to <8 x i8> + ret <8 x i8> %3 +} + +define void @trunc_usat_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) { +; SSE2-LABEL: trunc_usat_v8i64_v8i8_store: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: pxor %xmm5, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: packuswb %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm9, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm1 +; SSE2-NEXT: packuswb %xmm0, %xmm1 +; SSE2-NEXT: movq %xmm1, (%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v8i64_v8i8_store: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa %xmm1, %xmm7 +; SSSE3-NEXT: pxor %xmm5, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711] +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm5, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm0, %xmm1 +; SSSE3-NEXT: packuswb %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm5, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm2, %xmm5 +; SSSE3-NEXT: movdqa %xmm9, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm3, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm0 +; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: packuswb %xmm4, %xmm0 +; SSSE3-NEXT: packuswb %xmm0, %xmm1 +; SSSE3-NEXT: packuswb %xmm0, %xmm1 +; SSSE3-NEXT: movq %xmm1, (%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v8i64_v8i8_store: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: movapd {{.*#+}} xmm9 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 +; SSE41-NEXT: packusdw %xmm5, %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 +; SSE41-NEXT: pxor %xmm2, %xmm7 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm7, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9 +; SSE41-NEXT: packusdw %xmm5, %xmm9 +; SSE41-NEXT: packusdw %xmm9, %xmm1 +; SSE41-NEXT: packuswb %xmm0, %xmm1 +; SSE41-NEXT: movq %xmm1, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v8i64_v8i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpxor %xmm2, %xmm5, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm6 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm4, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [255,255] +; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm4, %xmm2 +; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm4, %xmm2 +; AVX1-NEXT: vblendvpd %xmm8, %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v8i64_v8i8_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm4 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063] +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: vmovq %xmm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_usat_v8i64_v8i8_store: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovusqb %zmm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp ult <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> + %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> + %3 = trunc <8 x i64> %2 to <8 x i8> + store <8 x i8> %3, <8 x i8> *%p1 + ret void +} + +define <16 x i8> @trunc_usat_v16i64_v16i8(<16 x i64> %a0) { +; SSE2-LABEL: trunc_usat_v16i64_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm1, %xmm11 +; SSE2-NEXT: pxor %xmm9, %xmm11 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm10, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm12 +; SSE2-NEXT: pand %xmm12, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm12 +; SSE2-NEXT: por %xmm1, %xmm12 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm9, %xmm1 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm14, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm12, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm9, %xmm1 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm13, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm9, %xmm3 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm13, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: packuswb %xmm1, %xmm3 +; SSE2-NEXT: packuswb %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm9, %xmm1 +; SSE2-NEXT: movdqa %xmm10, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm5 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm9, %xmm1 +; SSE2-NEXT: movdqa %xmm10, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm7, %xmm2 +; SSE2-NEXT: pxor %xmm9, %xmm2 +; SSE2-NEXT: movdqa %xmm10, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm7 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: pxor %xmm6, %xmm9 +; SSE2-NEXT: movdqa %xmm10, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm6 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v16i64_v16i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa %xmm1, %xmm11 +; SSSE3-NEXT: pxor %xmm9, %xmm11 +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259711,9223372039002259711] +; SSSE3-NEXT: movdqa %xmm10, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm12 +; SSSE3-NEXT: pand %xmm12, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm12 +; SSSE3-NEXT: por %xmm1, %xmm12 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm9, %xmm1 +; SSSE3-NEXT: movdqa %xmm10, %xmm11 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3] +; SSSE3-NEXT: por %xmm14, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: packuswb %xmm12, %xmm0 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm9, %xmm1 +; SSSE3-NEXT: movdqa %xmm10, %xmm11 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm12, %xmm13 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3] +; SSSE3-NEXT: por %xmm13, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm3, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pxor %xmm9, %xmm3 +; SSSE3-NEXT: movdqa %xmm10, %xmm11 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm12, %xmm13 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,3,3] +; SSSE3-NEXT: por %xmm13, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: packuswb %xmm1, %xmm3 +; SSSE3-NEXT: packuswb %xmm3, %xmm0 +; SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm9, %xmm1 +; SSSE3-NEXT: movdqa %xmm10, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm5 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm5, %xmm2 +; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: pxor %xmm9, %xmm1 +; SSSE3-NEXT: movdqa %xmm10, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm11, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm4 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: packuswb %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm7, %xmm2 +; SSSE3-NEXT: pxor %xmm9, %xmm2 +; SSSE3-NEXT: movdqa %xmm10, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm7 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm7, %xmm3 +; SSSE3-NEXT: pxor %xmm6, %xmm9 +; SSSE3-NEXT: movdqa %xmm10, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm6 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm6, %xmm2 +; SSSE3-NEXT: packuswb %xmm3, %xmm2 +; SSSE3-NEXT: packuswb %xmm2, %xmm1 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v16i64_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: movapd {{.*#+}} xmm9 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm11, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259711,9223372039002259711] +; SSE41-NEXT: movdqa %xmm10, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm12 +; SSE41-NEXT: movdqa %xmm10, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: pand %xmm12, %xmm0 +; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm12 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm12 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm11, %xmm0 +; SSE41-NEXT: movdqa %xmm10, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm13 +; SSE41-NEXT: movdqa %xmm10, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm13, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm13 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm13 +; SSE41-NEXT: packusdw %xmm12, %xmm13 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm11, %xmm0 +; SSE41-NEXT: movdqa %xmm10, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm10, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm8 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm11, %xmm0 +; SSE41-NEXT: movdqa %xmm10, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm10, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: packusdw %xmm8, %xmm1 +; SSE41-NEXT: packusdw %xmm1, %xmm13 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm11, %xmm0 +; SSE41-NEXT: movdqa %xmm10, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm10, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm11, %xmm0 +; SSE41-NEXT: movdqa %xmm10, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm10, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: packusdw %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm11, %xmm0 +; SSE41-NEXT: movdqa %xmm10, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm10, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: pxor %xmm6, %xmm11 +; SSE41-NEXT: movdqa %xmm10, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm11, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm11, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm10, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm9 +; SSE41-NEXT: packusdw %xmm1, %xmm9 +; SSE41-NEXT: packusdw %xmm9, %xmm2 +; SSE41-NEXT: packuswb %xmm2, %xmm13 +; SSE41-NEXT: movdqa %xmm13, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa %ymm0, %ymm8 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm5, %xmm8, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm11 +; AVX1-NEXT: vpxor %xmm5, %xmm11, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm10 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm14 +; AVX1-NEXT: vpxor %xmm5, %xmm14, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm12 +; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm13 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 +; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm15 +; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0 +; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vmovapd {{.*#+}} xmm6 = [255,255] +; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm6, %xmm9 +; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vblendvpd %xmm15, %xmm7, %xmm6, %xmm4 +; AVX1-NEXT: vblendvpd %xmm13, %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vblendvpd %xmm12, %xmm14, %xmm6, %xmm5 +; AVX1-NEXT: vblendvpd %xmm10, %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vblendvpd %xmm0, %xmm11, %xmm6, %xmm7 +; AVX1-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vblendvpd %xmm0, %xmm8, %xmm6, %xmm6 +; AVX1-NEXT: vpackusdw %xmm9, %xmm3, %xmm0 +; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpackusdw %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm2 +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v16i64_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm6 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063] +; AVX2-NEXT: vpcmpgtq %ymm6, %ymm7, %ymm6 +; AVX2-NEXT: vblendvpd %ymm6, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpxor %ymm5, %ymm0, %ymm6 +; AVX2-NEXT: vpcmpgtq %ymm6, %ymm7, %ymm6 +; AVX2-NEXT: vblendvpd %ymm6, %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm5, %ymm3, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm7, %ymm1 +; AVX2-NEXT: vblendvpd %ymm1, %ymm3, %ymm4, %ymm1 +; AVX2-NEXT: vpxor %ymm5, %ymm2, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm7, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_usat_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255] +; AVX512-NEXT: vpminuq %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vpminuq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp ult <16 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> + %2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> + %3 = trunc <16 x i64> %2 to <16 x i8> + ret <16 x i8> %3 +} + +define <8 x i8> @trunc_usat_v8i32_v8i8(<8 x i32> %a0) { +; SSE2-LABEL: trunc_usat_v8i32_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm6 +; SSE2-NEXT: por %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm0, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: packuswb %xmm6, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v8i32_v8i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm3, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pandn %xmm2, %xmm6 +; SSSE3-NEXT: por %xmm6, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pandn %xmm2, %xmm5 +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; SSSE3-NEXT: pshufb %xmm1, %xmm5 +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v8i32_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] +; SSE41-NEXT: pminud %xmm2, %xmm0 +; SSE41-NEXT: pminud %xmm2, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE41-NEXT: pshufb %xmm2, %xmm1 +; SSE41-NEXT: pshufb %xmm2, %xmm0 +; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v8i32_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255] +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v8i32_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_usat_v8i32_v8i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_usat_v8i32_v8i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_usat_v8i32_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_usat_v8i32_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp ult <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %3 = trunc <8 x i32> %2 to <8 x i8> + ret <8 x i8> %3 +} + +define void @trunc_usat_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) { +; SSE2-LABEL: trunc_usat_v8i32_v8i8_store: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: packuswb %xmm6, %xmm5 +; SSE2-NEXT: packuswb %xmm0, %xmm5 +; SSE2-NEXT: movq %xmm5, (%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v8i32_v8i8_store: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm3, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pandn %xmm2, %xmm6 +; SSSE3-NEXT: por %xmm0, %xmm6 +; SSSE3-NEXT: pxor %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pandn %xmm2, %xmm5 +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; SSSE3-NEXT: pshufb %xmm0, %xmm5 +; SSSE3-NEXT: pshufb %xmm0, %xmm6 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSSE3-NEXT: movq %xmm6, (%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v8i32_v8i8_store: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] +; SSE41-NEXT: pminud %xmm2, %xmm0 +; SSE41-NEXT: pminud %xmm2, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE41-NEXT: pshufb %xmm2, %xmm1 +; SSE41-NEXT: pshufb %xmm2, %xmm0 +; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE41-NEXT: movq %xmm0, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v8i32_v8i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255] +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-NEXT: vmovq %xmm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v8i32_v8i8_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vmovq %xmm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_usat_v8i32_v8i8_store: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, (%rdi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_usat_v8i32_v8i8_store: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovusdb %ymm0, (%rdi) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_usat_v8i32_v8i8_store: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rdi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_usat_v8i32_v8i8_store: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovusdb %ymm0, (%rdi) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp ult <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %3 = trunc <8 x i32> %2 to <8 x i8> + store <8 x i8> %3, <8 x i8> *%p1 + ret void +} + +define <16 x i8> @trunc_usat_v16i32_v16i8(<16 x i32> %a0) { +; SSE2-LABEL: trunc_usat_v16i32_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: pxor %xmm6, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm7 +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: packuswb %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: packuswb %xmm4, %xmm5 +; SSE2-NEXT: packuswb %xmm5, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v16i32_v16i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm1, %xmm7 +; SSSE3-NEXT: pxor %xmm6, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm6, %xmm1 +; SSSE3-NEXT: movdqa %xmm5, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm7 +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: packuswb %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm6, %xmm1 +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm2, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm2, %xmm5 +; SSSE3-NEXT: packuswb %xmm4, %xmm5 +; SSSE3-NEXT: packuswb %xmm5, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v16i32_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255] +; SSE41-NEXT: pminud %xmm4, %xmm1 +; SSE41-NEXT: pminud %xmm4, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: pminud %xmm4, %xmm3 +; SSE41-NEXT: pminud %xmm4, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255] +; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpminud %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_usat_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovusdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp ult <16 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %3 = trunc <16 x i32> %2 to <16 x i8> + ret <16 x i8> %3 +} + +define <16 x i8> @trunc_usat_v16i16_v16i8(<16 x i16> %a0) { +; SSE2-LABEL: trunc_usat_v16i16_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [33023,33023,33023,33023,33023,33023,33023,33023] +; SSE2-NEXT: pminsw %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pminsw %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v16i16_v16i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [33023,33023,33023,33023,33023,33023,33023,33023] +; SSSE3-NEXT: pminsw %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: pminsw %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v16i16_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pminuw %xmm2, %xmm1 +; SSE41-NEXT: pminuw %xmm2, %xmm0 +; SSE41-NEXT: packuswb %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpminuw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_usat_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_usat_v16i16_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_usat_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_usat_v16i16_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovuswb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp ult <16 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %2 = select <16 x i1> %1, <16 x i16> %a0, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %3 = trunc <16 x i16> %2 to <16 x i8> + ret <16 x i8> %3 +} + +define <32 x i8> @trunc_usat_v32i16_v32i8(<32 x i16> %a0) { +; SSE2-LABEL: trunc_usat_v32i16_v32i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [33023,33023,33023,33023,33023,33023,33023,33023] +; SSE2-NEXT: pminsw %xmm5, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pminsw %xmm5, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pminsw %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pminsw %xmm5, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v32i16_v32i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [33023,33023,33023,33023,33023,33023,33023,33023] +; SSSE3-NEXT: pminsw %xmm5, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: pminsw %xmm5, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: packuswb %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: pminsw %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: pminsw %xmm5, %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v32i16_v32i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pminuw %xmm4, %xmm3 +; SSE41-NEXT: pminuw %xmm4, %xmm2 +; SSE41-NEXT: packuswb %xmm3, %xmm2 +; SSE41-NEXT: pminuw %xmm4, %xmm1 +; SSE41-NEXT: pminuw %xmm4, %xmm0 +; SSE41-NEXT: packuswb %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v32i16_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpminuw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpminuw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpminuw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpminuw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v32i16_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpminuw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpminuw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_usat_v32i16_v32i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminuw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpminuw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_usat_v32i16_v32i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpminuw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpminuw %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_usat_v32i16_v32i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_usat_v32i16_v32i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovuswb %zmm0, %ymm0 +; AVX512BWVL-NEXT: retq + %1 = icmp ult <32 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %2 = select <32 x i1> %1, <32 x i16> %a0, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %3 = trunc <32 x i16> %2 to <32 x i8> + ret <32 x i8> %3 +} diff --git a/llvm/test/CodeGen/X86/vector-trunc-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-widen.ll new file mode 100644 index 00000000000..42bf9a7c049 --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-trunc-widen.ll @@ -0,0 +1,2126 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL + +define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) { +; SSE-LABEL: trunc8i64_8i32: +; SSE: # %bb.0: # %entry +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc8i64_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc8i64_8i32: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc8i64_8i32: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc8i64_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: retq +entry: + %0 = trunc <8 x i64> %a to <8 x i32> + ret <8 x i32> %0 +} + +define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) { +; SSE-LABEL: trunc8i64_8i32_ashr: +; SSE: # %bb.0: # %entry +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc8i64_8i32_ashr: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc8i64_8i32_ashr: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc8i64_8i32_ashr: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpsraq $32, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: retq +entry: + %0 = ashr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> + %1 = trunc <8 x i64> %0 to <8 x i32> + ret <8 x i32> %1 +} + +define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) { +; SSE-LABEL: trunc8i64_8i32_lshr: +; SSE: # %bb.0: # %entry +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc8i64_8i32_lshr: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc8i64_8i32_lshr: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc8i64_8i32_lshr: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: retq +entry: + %0 = lshr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> + %1 = trunc <8 x i64> %0 to <8 x i32> + ret <8 x i32> %1 +} + +define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) { +; SSE2-LABEL: trunc8i64_8i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc8i64_8i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc8i64_8i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packusdw %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc8i64_8i16: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc8i64_8i16: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc8i64_8i16: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc8i64_8i16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %0 = trunc <8 x i64> %a to <8 x i16> + ret <8 x i16> %0 +} + +define void @trunc8i64_8i8(<8 x i64> %a) { +; SSE2-LABEL: trunc8i64_8i8: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movq %xmm0, (%rax) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc8i64_8i8: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: packuswb %xmm3, %xmm2 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSSE3-NEXT: packuswb %xmm0, %xmm0 +; SSSE3-NEXT: movq %xmm0, (%rax) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc8i64_8i8: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pand %xmm4, %xmm1 +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packusdw %xmm2, %xmm0 +; SSE41-NEXT: packuswb %xmm0, %xmm0 +; SSE41-NEXT: movq %xmm0, (%rax) +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc8i64_8i8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc8i64_8i8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc8i64_8i8: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovqb %zmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %0 = trunc <8 x i64> %a to <8 x i8> + store <8 x i8> %0, <8 x i8>* undef, align 4 + ret void +} + +define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) { +; SSE2-LABEL: trunc8i32_8i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc8i32_8i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc8i32_8i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE41-NEXT: pshufb %xmm2, %xmm1 +; SSE41-NEXT: pshufb %xmm2, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc8i32_8i16: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc8i32_8i16: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc8i32_8i16: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc8i32_8i16: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc8i32_8i16: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc8i32_8i16: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +entry: + %0 = trunc <8 x i32> %a to <8 x i16> + ret <8 x i16> %0 +} + +define <8 x i16> @trunc8i32_8i16_ashr(<8 x i32> %a) { +; SSE-LABEL: trunc8i32_8i16_ashr: +; SSE: # %bb.0: # %entry +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc8i32_8i16_ashr: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 +; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc8i32_8i16_ashr: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc8i32_8i16_ashr: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vpsrad $16, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc8i32_8i16_ashr: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vpsrad $16, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc8i32_8i16_ashr: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vpsrad $16, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc8i32_8i16_ashr: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: vpsrad $16, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +entry: + %0 = ashr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + %1 = trunc <8 x i32> %0 to <8 x i16> + ret <8 x i16> %1 +} + +define <8 x i16> @trunc8i32_8i16_lshr(<8 x i32> %a) { +; SSE2-LABEL: trunc8i32_8i16_lshr: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc8i32_8i16_lshr: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,10,11,14,15,14,15,255,255] +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc8i32_8i16_lshr: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc8i32_8i16_lshr: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc8i32_8i16_lshr: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc8i32_8i16_lshr: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc8i32_8i16_lshr: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc8i32_8i16_lshr: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc8i32_8i16_lshr: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +entry: + %0 = lshr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + %1 = trunc <8 x i32> %0 to <8 x i16> + ret <8 x i16> %1 +} + +define void @trunc8i32_8i8(<8 x i32> %a) { +; SSE2-LABEL: trunc8i32_8i8: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movq %xmm0, (%rax) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc8i32_8i8: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: movq %xmm0, (%rax) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc8i32_8i8: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE41-NEXT: pshufb %xmm2, %xmm1 +; SSE41-NEXT: pshufb %xmm2, %xmm0 +; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE41-NEXT: movq %xmm0, (%rax) +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc8i32_8i8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vmovq %xmm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc8i32_8i8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc8i32_8i8: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, (%rax) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc8i32_8i8: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vpmovdb %ymm0, (%rax) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc8i32_8i8: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rax) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc8i32_8i8: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rax) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +entry: + %0 = trunc <8 x i32> %a to <8 x i8> + store <8 x i8> %0, <8 x i8>* undef, align 4 + ret void +} + +define void @trunc16i32_16i16(<16 x i32> %a) { +; SSE2-LABEL: trunc16i32_16i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: pslld $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: pslld $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: movdqu %xmm2, (%rax) +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc16i32_16i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pslld $16, %xmm1 +; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: pslld $16, %xmm0 +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSSE3-NEXT: pslld $16, %xmm3 +; SSSE3-NEXT: psrad $16, %xmm3 +; SSSE3-NEXT: pslld $16, %xmm2 +; SSSE3-NEXT: psrad $16, %xmm2 +; SSSE3-NEXT: packssdw %xmm3, %xmm2 +; SSSE3-NEXT: movdqu %xmm2, (%rax) +; SSSE3-NEXT: movdqu %xmm0, (%rax) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc16i32_16i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7] +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: movdqu %xmm2, (%rax) +; SSE41-NEXT: movdqu %xmm0, (%rax) +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc16i32_16i16: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc16i32_16i16: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqu %xmm1, (%rax) +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc16i32_16i16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovdw %zmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %0 = trunc <16 x i32> %a to <16 x i16> + store <16 x i16> %0, <16 x i16>* undef, align 4 + ret void +} + +define void @trunc16i32_16i16_ashr(<16 x i32> %a) { +; SSE-LABEL: trunc16i32_16i16_ashr: +; SSE: # %bb.0: # %entry +; SSE-NEXT: psrad $16, %xmm3 +; SSE-NEXT: psrad $16, %xmm2 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: movdqu %xmm2, (%rax) +; SSE-NEXT: movdqu %xmm0, (%rax) +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc16i32_16i16_ashr: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2 +; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2 +; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc16i32_16i16_ashr: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpsrad $16, %ymm1, %ymm1 +; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc16i32_16i16_ashr: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdw %zmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %0 = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + %1 = trunc <16 x i32> %0 to <16 x i16> + store <16 x i16> %1, <16 x i16>* undef, align 4 + ret void +} + +define void @trunc16i32_16i16_lshr(<16 x i32> %a) { +; SSE2-LABEL: trunc16i32_16i16_lshr: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: movdqu %xmm2, (%rax) +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc16i32_16i16_lshr: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSSE3-NEXT: psrad $16, %xmm3 +; SSSE3-NEXT: psrad $16, %xmm2 +; SSSE3-NEXT: packssdw %xmm3, %xmm2 +; SSSE3-NEXT: movdqu %xmm2, (%rax) +; SSSE3-NEXT: movdqu %xmm0, (%rax) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc16i32_16i16_lshr: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: psrld $16, %xmm3 +; SSE41-NEXT: psrld $16, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: movdqu %xmm2, (%rax) +; SSE41-NEXT: movdqu %xmm0, (%rax) +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc16i32_16i16_lshr: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc16i32_16i16_lshr: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc16i32_16i16_lshr: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdw %zmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %0 = lshr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + %1 = trunc <16 x i32> %0 to <16 x i16> + store <16 x i16> %1, <16 x i16>* undef, align 4 + ret void +} + +define void @trunc16i32_16i8(<16 x i32> %a) { +; SSE2-LABEL: trunc16i32_16i8: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc16i32_16i8: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: packuswb %xmm3, %xmm2 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSSE3-NEXT: movdqu %xmm0, (%rax) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc16i32_16i8: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pand %xmm4, %xmm1 +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: movdqu %xmm0, (%rax) +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc16i32_16i8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc16i32_16i8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc16i32_16i8: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovdb %zmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %0 = trunc <16 x i32> %a to <16 x i8> + store <16 x i8> %0, <16 x i8>* undef, align 4 + ret void +} + +define void @trunc16i32_16i8_ashr(<16 x i32> %a) { +; SSE-LABEL: trunc16i32_16i8_ashr: +; SSE: # %bb.0: # %entry +; SSE-NEXT: psrad $24, %xmm1 +; SSE-NEXT: psrad $24, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: psrad $24, %xmm3 +; SSE-NEXT: psrad $24, %xmm2 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: packsswb %xmm2, %xmm0 +; SSE-NEXT: movdqu %xmm0, (%rax) +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc16i32_16i8_ashr: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2 +; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2 +; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc16i32_16i8_ashr: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpsrad $24, %ymm1, %ymm1 +; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc16i32_16i8_ashr: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpsrld $24, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %0 = ashr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24> + %1 = trunc <16 x i32> %0 to <16 x i8> + store <16 x i8> %1, <16 x i8>* undef, align 4 + ret void +} + +define void @trunc16i32_16i8_lshr(<16 x i32> %a) { +; SSE2-LABEL: trunc16i32_16i8_lshr: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: psrld $24, %xmm1 +; SSE2-NEXT: psrld $24, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: psrld $24, %xmm3 +; SSE2-NEXT: psrld $24, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc16i32_16i8_lshr: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: psrld $24, %xmm1 +; SSSE3-NEXT: psrld $24, %xmm0 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: psrld $24, %xmm3 +; SSSE3-NEXT: psrld $24, %xmm2 +; SSSE3-NEXT: packuswb %xmm3, %xmm2 +; SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSSE3-NEXT: movdqu %xmm0, (%rax) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc16i32_16i8_lshr: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: psrld $24, %xmm1 +; SSE41-NEXT: psrld $24, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: psrld $24, %xmm3 +; SSE41-NEXT: psrld $24, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: movdqu %xmm0, (%rax) +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc16i32_16i8_lshr: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2 +; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2 +; AVX1-NEXT: vpsrld $24, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc16i32_16i8_lshr: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1 +; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc16i32_16i8_lshr: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpsrld $24, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %0 = lshr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24> + %1 = trunc <16 x i32> %0 to <16 x i8> + store <16 x i8> %1, <16 x i8>* undef, align 4 + ret void +} + +;PR25684 +define void @trunc16i16_16i8(<16 x i16> %a) { +; SSE2-LABEL: trunc16i16_16i8: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc16i16_16i8: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: movdqu %xmm0, (%rax) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc16i16_16i8: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; SSE41-NEXT: pshufb %xmm2, %xmm1 +; SSE41-NEXT: pshufb %xmm2, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: movdqu %xmm0, (%rax) +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc16i16_16i8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc16i16_16i8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc16i16_16i8: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, (%rax) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc16i16_16i8: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, (%rax) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc16i16_16i8: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc16i16_16i8: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +entry: + %0 = trunc <16 x i16> %a to <16 x i8> + store <16 x i8> %0, <16 x i8>* undef, align 4 + ret void +} + +define void @trunc16i16_16i8_ashr(<16 x i16> %a) { +; SSE-LABEL: trunc16i16_16i8_ashr: +; SSE: # %bb.0: # %entry +; SSE-NEXT: psraw $8, %xmm1 +; SSE-NEXT: psraw $8, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: movdqu %xmm0, (%rax) +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc16i16_16i8_ashr: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc16i16_16i8_ashr: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpsraw $8, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc16i16_16i8_ashr: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, (%rax) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc16i16_16i8_ashr: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, (%rax) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc16i16_16i8_ashr: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vpsraw $8, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc16i16_16i8_ashr: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +entry: + %0 = ashr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %1 = trunc <16 x i16> %0 to <16 x i8> + store <16 x i8> %1, <16 x i8>* undef, align 4 + ret void +} + +define void @trunc16i16_16i8_lshr(<16 x i16> %a) { +; SSE-LABEL: trunc16i16_16i8_lshr: +; SSE: # %bb.0: # %entry +; SSE-NEXT: psrlw $8, %xmm1 +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movdqu %xmm0, (%rax) +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc16i16_16i8_lshr: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc16i16_16i8_lshr: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc16i16_16i8_lshr: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, (%rax) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc16i16_16i8_lshr: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, (%rax) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc16i16_16i8_lshr: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc16i16_16i8_lshr: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +entry: + %0 = lshr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %1 = trunc <16 x i16> %0 to <16 x i8> + store <16 x i8> %1, <16 x i8>* undef, align 4 + ret void +} + +define void @trunc32i16_32i8(<32 x i16> %a) { +; SSE2-LABEL: trunc32i16_32i8: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: movdqu %xmm2, (%rax) +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc32i16_32i8: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; SSSE3-NEXT: pshufb %xmm4, %xmm1 +; SSSE3-NEXT: pshufb %xmm4, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: pshufb %xmm4, %xmm3 +; SSSE3-NEXT: pshufb %xmm4, %xmm2 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSSE3-NEXT: movdqu %xmm2, (%rax) +; SSSE3-NEXT: movdqu %xmm0, (%rax) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc32i16_32i8: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; SSE41-NEXT: pshufb %xmm4, %xmm1 +; SSE41-NEXT: pshufb %xmm4, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: pshufb %xmm4, %xmm3 +; SSE41-NEXT: pshufb %xmm4, %xmm2 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE41-NEXT: movdqu %xmm2, (%rax) +; SSE41-NEXT: movdqu %xmm0, (%rax) +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc32i16_32i8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc32i16_32i8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpackuswb %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc32i16_32i8: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovdb %zmm1, (%rax) +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, (%rax) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc32i16_32i8: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vpmovdb %zmm1, (%rax) +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, (%rax) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc32i16_32i8: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vpmovwb %zmm0, (%rax) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc32i16_32i8: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rax) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +entry: + %0 = trunc <32 x i16> %a to <32 x i8> + store <32 x i8> %0, <32 x i8>* undef, align 4 + ret void +} + +define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) { +; SSE-LABEL: trunc2x4i64_8i32: +; SSE: # %bb.0: # %entry +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc2x4i64_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc2x4i64_8i32: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc2x4i64_8i32: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc2x4i64_8i32: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc2x4i64_8i32: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0 +; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc2x4i64_8i32: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc2x4i64_8i32: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1 +; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: retq +entry: + %0 = trunc <4 x i64> %a to <4 x i32> + %1 = trunc <4 x i64> %b to <4 x i32> + %2 = shufflevector <4 x i32> %0, <4 x i32> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x i32> %2 +} + +define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) { +; SSE2-LABEL: trunc2x4i64_8i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc2x4i64_8i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc2x4i64_8i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; SSE41-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc2x4i64_8i16: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc2x4i64_8i16: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc2x4i64_8i16: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc2x4i64_8i16: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc2x4i64_8i16: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512VL-NEXT: vpmovqw %ymm1, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc2x4i64_8i16: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc2x4i64_8i16: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovqw %ymm1, %xmm1 +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +entry: + %0 = trunc <4 x i64> %a to <4 x i16> + %1 = trunc <4 x i64> %b to <4 x i16> + %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x i16> %2 +} + +define <4 x i32> @trunc2x2i64_4i32(<2 x i64> %a, <2 x i64> %b) { +; SSE-LABEL: trunc2x2i64_4i32: +; SSE: # %bb.0: # %entry +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: retq +; +; AVX-LABEL: trunc2x2i64_4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX-NEXT: retq +; +; AVX512-LABEL: trunc2x2i64_4i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512-NEXT: retq +entry: + %0 = trunc <2 x i64> %a to <2 x i32> + %1 = trunc <2 x i64> %b to <2 x i32> + %2 = shufflevector <2 x i32> %0, <2 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x i32> %2 +} + +define i64 @trunc2i64_i64(<2 x i64> %inval) { +; SSE-LABEL: trunc2i64_i64: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: retq +; +; AVX-LABEL: trunc2i64_i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: retq +; +; AVX512-LABEL: trunc2i64_i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: retq +entry: + %0 = trunc <2 x i64> %inval to <2 x i32> + %1 = bitcast <2 x i32> %0 to i64 + ret i64 %1 +} + +define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: trunc2x4i32_8i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc2x4i32_8i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc2x4i32_8i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE41-NEXT: pshufb %xmm2, %xmm1 +; SSE41-NEXT: pshufb %xmm2, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: trunc2x4i32_8i16: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc2x4i32_8i16: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc2x4i32_8i16: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc2x4i32_8i16: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc2x4i32_8i16: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14] +; AVX512BWVL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 +; AVX512BWVL-NEXT: retq +entry: + %0 = trunc <4 x i32> %a to <4 x i16> + %1 = trunc <4 x i32> %b to <4 x i16> + %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x i16> %2 +} + +; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524 +define i64 @trunc4i32_i64(<4 x i32> %inval) { +; SSE2-LABEL: trunc4i32_i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc4i32_i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: movq %xmm0, %rax +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc4i32_i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: retq +; +; AVX-LABEL: trunc4i32_i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: retq +; +; AVX512-LABEL: trunc4i32_i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: retq +entry: + %0 = trunc <4 x i32> %inval to <4 x i16> + %1 = bitcast <4 x i16> %0 to i64 + ret i64 %1 +} + +define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: trunc2x8i16_16i8: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc2x8i16_16i8: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc2x8i16_16i8: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; SSE41-NEXT: pshufb %xmm2, %xmm1 +; SSE41-NEXT: pshufb %xmm2, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: trunc2x8i16_16i8: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq +; +; AVX512-LABEL: trunc2x8i16_16i8: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: retq +entry: + %0 = trunc <8 x i16> %a to <8 x i8> + %1 = trunc <8 x i16> %b to <8 x i8> + %2 = shufflevector <8 x i8> %0, <8 x i8> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %2 +} + +; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524 +define i64 @trunc8i16_i64(<8 x i16> %inval) { +; SSE2-LABEL: trunc8i16_i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc8i16_i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: movq %xmm0, %rax +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc8i16_i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: retq +; +; AVX-LABEL: trunc8i16_i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: retq +; +; AVX512-LABEL: trunc8i16_i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: retq +entry: + %0 = trunc <8 x i16> %inval to <8 x i8> + %1 = bitcast <8 x i8> %0 to i64 + ret i64 %1 +} + +define <16 x i8> @trunc16i64_16i8_const() { +; SSE-LABEL: trunc16i64_16i8_const: +; SSE: # %bb.0: # %entry +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: trunc16i64_16i8_const: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: trunc16i64_16i8_const: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: retq + +entry: + %0 = trunc <16 x i64> zeroinitializer to <16 x i8> + %1 = shufflevector <16 x i8> %0, <16 x i8> %0, <16 x i32> <i32 28, i32 30, i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 undef, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26> + ret <16 x i8> %1 +} + +define <8 x i16> @PR32160(<8 x i32> %x) { +; SSE-LABEL: PR32160: +; SSE: # %bb.0: +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: retq +; +; AVX1-LABEL: PR32160: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: PR32160: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: PR32160: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: PR32160: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7] +; AVX512F-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: PR32160: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5] +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: PR32160: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5] +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: PR32160: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5] +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %shuf = trunc <8 x i32> %x to <8 x i16> + %trunc = shufflevector <8 x i16> %shuf, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> + ret <8 x i16> %trunc +} + +define void @PR34773(i16* %a0, i8* %a1) { +; SSE-LABEL: PR34773: +; SSE: # %bb.0: +; SSE-NEXT: movdqu (%rdi), %xmm0 +; SSE-NEXT: movdqu 16(%rdi), %xmm1 +; SSE-NEXT: movdqu 32(%rdi), %xmm2 +; SSE-NEXT: movdqu 48(%rdi), %xmm3 +; SSE-NEXT: psrlw $8, %xmm1 +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: psrlw $8, %xmm3 +; SSE-NEXT: psrlw $8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: movdqu %xmm0, (%rsi) +; SSE-NEXT: movdqu %xmm2, 16(%rsi) +; SSE-NEXT: retq +; +; AVX1-LABEL: PR34773: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqu (%rdi), %xmm0 +; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqu 32(%rdi), %xmm2 +; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vmovdqu %xmm0, (%rsi) +; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR34773: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqu %xmm0, (%rsi) +; AVX2-NEXT: vmovdqu %xmm1, 16(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: PR34773: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqu 32(%rdi), %ymm1 +; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, (%rsi) +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, 16(%rsi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: PR34773: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqu 32(%rdi), %ymm1 +; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi) +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, 16(%rsi) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: PR34773: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqu 32(%rdi), %ymm1 +; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 +; AVX512BW-NEXT: vmovdqu %xmm0, (%rsi) +; AVX512BW-NEXT: vmovdqu %xmm1, 16(%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: PR34773: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpsrlw $8, 32(%rdi), %ymm1 +; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi) +; AVX512BWVL-NEXT: vpmovwb %ymm1, 16(%rsi) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = getelementptr i16, i16* %a0, i64 16 + %2 = getelementptr i8, i8* %a1, i64 16 + %3 = bitcast i16* %a0 to <16 x i16>* + %4 = bitcast i16* %1 to <16 x i16>* + %5 = bitcast i8* %a1 to <16 x i8>* + %6 = bitcast i8* %2 to <16 x i8>* + %7 = load <16 x i16>, <16 x i16>* %3, align 2 + %8 = load <16 x i16>, <16 x i16>* %4, align 2 + %9 = lshr <16 x i16> %7, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %10 = lshr <16 x i16> %8, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %11 = trunc <16 x i16> %9 to <16 x i8> + %12 = trunc <16 x i16> %10 to <16 x i8> + store <16 x i8> %11, <16 x i8>* %5, align 1 + store <16 x i8> %12, <16 x i8>* %6, align 1 + ret void +} + +; Store merging must not infinitely fight store splitting. + +define void @store_merge_split(<8 x i32> %w1, <8 x i32> %w2, i64 %idx, <8 x i16>* %p) align 2 { +; SSE2-LABEL: store_merge_split: +; SSE2: # %bb.0: +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: pslld $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: pslld $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: shlq $4, %rdi +; SSE2-NEXT: movdqu %xmm0, (%rsi,%rdi) +; SSE2-NEXT: movdqu %xmm2, 16(%rsi,%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: store_merge_split: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm4, %xmm1 +; SSSE3-NEXT: pshufb %xmm4, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: pshufb %xmm4, %xmm3 +; SSSE3-NEXT: pshufb %xmm4, %xmm2 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSSE3-NEXT: shlq $4, %rdi +; SSSE3-NEXT: movdqu %xmm0, (%rsi,%rdi) +; SSSE3-NEXT: movdqu %xmm2, 16(%rsi,%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: store_merge_split: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE41-NEXT: pshufb %xmm4, %xmm1 +; SSE41-NEXT: pshufb %xmm4, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: pshufb %xmm4, %xmm3 +; SSE41-NEXT: pshufb %xmm4, %xmm2 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE41-NEXT: shlq $4, %rdi +; SSE41-NEXT: movdqu %xmm0, (%rsi,%rdi) +; SSE41-NEXT: movdqu %xmm2, 16(%rsi,%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: store_merge_split: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: shlq $4, %rdi +; AVX1-NEXT: vmovdqu %xmm0, (%rsi,%rdi) +; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: store_merge_split: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: shlq $4, %rdi +; AVX2-NEXT: vmovdqu %xmm0, (%rsi,%rdi) +; AVX2-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: store_merge_split: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: shlq $4, %rdi +; AVX512F-NEXT: vmovdqu %xmm0, (%rsi,%rdi) +; AVX512F-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: store_merge_split: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: shlq $4, %rdi +; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi,%rdi) +; AVX512VL-NEXT: vpmovdw %ymm1, 16(%rsi,%rdi) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: store_merge_split: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512BW-NEXT: shlq $4, %rdi +; AVX512BW-NEXT: vmovdqu %xmm0, (%rsi,%rdi) +; AVX512BW-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: store_merge_split: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: shlq $4, %rdi +; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi,%rdi) +; AVX512BWVL-NEXT: vpmovdw %ymm1, 16(%rsi,%rdi) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %t1 = trunc <8 x i32> %w1 to <8 x i16> + %t2 = trunc <8 x i32> %w2 to <8 x i16> + %g1 = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 %idx + %g2 = getelementptr inbounds <8 x i16>, <8 x i16>* %g1, i64 1 + store <8 x i16> %t1, <8 x i16>* %g1, align 2 + store <8 x i16> %t2, <8 x i16>* %g2, align 2 + ret void +} diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll index b09d14e5e2b..e9472b80871 100644 --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -1998,129 +1998,3 @@ define void @PR34773(i16* %a0, i8* %a1) { store <16 x i8> %12, <16 x i8>* %6, align 1 ret void } - -; Store merging must not infinitely fight store splitting. - -define void @store_merge_split(<8 x i32> %w1, <8 x i32> %w2, i64 %idx, <8 x i16>* %p) align 2 { -; SSE2-LABEL: store_merge_split: -; SSE2: # %bb.0: -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: pslld $16, %xmm3 -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: packssdw %xmm3, %xmm2 -; SSE2-NEXT: shlq $4, %rdi -; SSE2-NEXT: movdqu %xmm0, (%rsi,%rdi) -; SSE2-NEXT: movdqu %xmm2, 16(%rsi,%rdi) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: store_merge_split: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm4, %xmm1 -; SSSE3-NEXT: pshufb %xmm4, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: pshufb %xmm4, %xmm3 -; SSSE3-NEXT: pshufb %xmm4, %xmm2 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSSE3-NEXT: shlq $4, %rdi -; SSSE3-NEXT: movdqu %xmm0, (%rsi,%rdi) -; SSSE3-NEXT: movdqu %xmm2, 16(%rsi,%rdi) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: store_merge_split: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSE41-NEXT: pshufb %xmm4, %xmm1 -; SSE41-NEXT: pshufb %xmm4, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: pshufb %xmm4, %xmm3 -; SSE41-NEXT: pshufb %xmm4, %xmm2 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE41-NEXT: shlq $4, %rdi -; SSE41-NEXT: movdqu %xmm0, (%rsi,%rdi) -; SSE41-NEXT: movdqu %xmm2, 16(%rsi,%rdi) -; SSE41-NEXT: retq -; -; AVX1-LABEL: store_merge_split: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-NEXT: shlq $4, %rdi -; AVX1-NEXT: vmovdqu %xmm0, (%rsi,%rdi) -; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: store_merge_split: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: shlq $4, %rdi -; AVX2-NEXT: vmovdqu %xmm0, (%rsi,%rdi) -; AVX2-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: store_merge_split: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512F-NEXT: shlq $4, %rdi -; AVX512F-NEXT: vmovdqu %xmm0, (%rsi,%rdi) -; AVX512F-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: store_merge_split: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: shlq $4, %rdi -; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi,%rdi) -; AVX512VL-NEXT: vpmovdw %ymm1, 16(%rsi,%rdi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: store_merge_split: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512BW-NEXT: shlq $4, %rdi -; AVX512BW-NEXT: vmovdqu %xmm0, (%rsi,%rdi) -; AVX512BW-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: store_merge_split: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: shlq $4, %rdi -; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi,%rdi) -; AVX512BWVL-NEXT: vpmovdw %ymm1, 16(%rsi,%rdi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq - %t1 = trunc <8 x i32> %w1 to <8 x i16> - %t2 = trunc <8 x i32> %w2 to <8 x i16> - %g1 = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 %idx - %g2 = getelementptr inbounds <8 x i16>, <8 x i16>* %g1, i64 1 - store <8 x i16> %t1, <8 x i16>* %g1, align 2 - store <8 x i16> %t2, <8 x i16>* %g2, align 2 - ret void -} diff --git a/llvm/test/CodeGen/X86/vector-zext-widen.ll b/llvm/test/CodeGen/X86/vector-zext-widen.ll new file mode 100644 index 00000000000..78c36d6fc8f --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-zext-widen.ll @@ -0,0 +1,2741 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW + +define <8 x i16> @zext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_16i8_to_8i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_16i8_to_8i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_16i8_to_8i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: retq +; +; AVX-LABEL: zext_16i8_to_8i16: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: retq +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %C = zext <8 x i8> %B to <8 x i16> + ret <8 x i16> %C +} + +; PR17654 +define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %A) { +; SSE2-LABEL: zext_16i8_to_16i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_16i8_to_16i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_16i8_to_16i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_16i8_to_16i16: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_16i8_to_16i16: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_16i8_to_16i16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: retq +entry: + %B = zext <16 x i8> %A to <16 x i16> + ret <16 x i16> %B +} + +define <32 x i16> @zext_32i8_to_32i16(<32 x i8> %A) { +; SSE2-LABEL: zext_32i8_to_32i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_32i8_to_32i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_32i8_to_32i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_32i8_to_32i16: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_32i8_to_32i16: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: zext_32i8_to_32i16: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512F-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: zext_32i8_to_32i16: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BW-NEXT: retq +entry: + %B = zext <32 x i8> %A to <32 x i16> + ret <32 x i16> %B +} + +define <4 x i32> @zext_16i8_to_4i32(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_16i8_to_4i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_16i8_to_4i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_16i8_to_4i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: zext_16i8_to_4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX-NEXT: retq +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %C = zext <4 x i8> %B to <4 x i32> + ret <4 x i32> %C +} + +define <8 x i32> @zext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_16i8_to_8i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_16i8_to_8i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_16i8_to_8i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_16i8_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_16i8_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_16i8_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: retq +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %C = zext <8 x i8> %B to <8 x i32> + ret <8 x i32> %C +} + +define <16 x i32> @zext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_16i8_to_16i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_16i8_to_16i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_16i8_to_16i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_16i8_to_16i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_16i8_to_16i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_16i8_to_16i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: retq +entry: + %B = zext <16 x i8> %A to <16 x i32> + ret <16 x i32> %B +} + +define <2 x i64> @zext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_16i8_to_2i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_16i8_to_2i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_16i8_to_2i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: zext_16i8_to_2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: retq +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1> + %C = zext <2 x i8> %B to <2 x i64> + ret <2 x i64> %C +} + +define <4 x i64> @zext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_16i8_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_16i8_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_16i8_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_16i8_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_16i8_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_16i8_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: retq +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %C = zext <4 x i8> %B to <4 x i64> + ret <4 x i64> %C +} + +define <8 x i64> @zext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_16i8_to_8i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_16i8_to_8i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[4],zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[6],zero,zero,zero,zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_16i8_to_8i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psrlq $48, %xmm0 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_16i8_to_8i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_16i8_to_8i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_16i8_to_8i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: retq +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %C = zext <8 x i8> %B to <8 x i64> + ret <8 x i64> %C +} + +define <4 x i32> @zext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_8i16_to_4i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_8i16_to_4i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_8i16_to_4i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: retq +; +; AVX-LABEL: zext_8i16_to_4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: retq +entry: + %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %C = zext <4 x i16> %B to <4 x i32> + ret <4 x i32> %C +} + +define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_8i16_to_8i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_8i16_to_8i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_8i16_to_8i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_8i16_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_8i16_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_8i16_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: retq +entry: + %B = zext <8 x i16> %A to <8 x i32> + ret <8 x i32>%B +} + +define <16 x i32> @zext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_16i16_to_16i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_16i16_to_16i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_16i16_to_16i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_16i16_to_16i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_16i16_to_16i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_16i16_to_16i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512-NEXT: retq +entry: + %B = zext <16 x i16> %A to <16 x i32> + ret <16 x i32> %B +} + +define <2 x i64> @zext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_8i16_to_2i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_8i16_to_2i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_8i16_to_2i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: zext_8i16_to_2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX-NEXT: retq +entry: + %B = shufflevector <8 x i16> %A, <8 x i16> undef, <2 x i32> <i32 0, i32 1> + %C = zext <2 x i16> %B to <2 x i64> + ret <2 x i64> %C +} + +define <4 x i64> @zext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_8i16_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_8i16_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_8i16_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_8i16_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_8i16_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_8i16_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512-NEXT: retq +entry: + %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %C = zext <4 x i16> %B to <4 x i64> + ret <4 x i64> %C +} + +define <8 x i64> @zext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_8i16_to_8i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_8i16_to_8i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_8i16_to_8i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_8i16_to_8i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_8i16_to_8i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_8i16_to_8i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: retq +entry: + %B = zext <8 x i16> %A to <8 x i64> + ret <8 x i64> %B +} + +define <2 x i64> @zext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_4i32_to_2i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_4i32_to_2i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_4i32_to_2i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: retq +; +; AVX-LABEL: zext_4i32_to_2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX-NEXT: retq +entry: + %B = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 0, i32 1> + %C = zext <2 x i32> %B to <2 x i64> + ret <2 x i64> %C +} + +define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_4i32_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_4i32_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movaps %xmm0, %xmm1 +; SSSE3-NEXT: xorps %xmm2, %xmm2 +; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_4i32_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_4i32_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_4i32_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_4i32_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: retq +entry: + %B = zext <4 x i32> %A to <4 x i64> + ret <4 x i64>%B +} + +define <8 x i64> @zext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_8i32_to_8i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: xorps %xmm4, %xmm4 +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: movaps %xmm3, %xmm2 +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE2-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_8i32_to_8i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movaps %xmm1, %xmm3 +; SSSE3-NEXT: movaps %xmm0, %xmm1 +; SSSE3-NEXT: xorps %xmm4, %xmm4 +; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSSE3-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSSE3-NEXT: movaps %xmm3, %xmm2 +; SSSE3-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSSE3-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_8i32_to_8i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_8i32_to_8i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_8i32_to_8i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_8i32_to_8i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; AVX512-NEXT: retq +entry: + %B = zext <8 x i32> %A to <8 x i64> + ret <8 x i64>%B +} + +define <2 x i64> @load_zext_2i8_to_2i64(<2 x i8> *%ptr) { +; SSE2-LABEL: load_zext_2i8_to_2i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movzwl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_2i8_to_2i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movzwl (%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_2i8_to_2i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: load_zext_2i8_to_2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: retq +entry: + %X = load <2 x i8>, <2 x i8>* %ptr + %Y = zext <2 x i8> %X to <2 x i64> + ret <2 x i64> %Y +} + +define <4 x i32> @load_zext_4i8_to_4i32(<4 x i8> *%ptr) { +; SSE2-LABEL: load_zext_4i8_to_4i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_4i8_to_4i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_4i8_to_4i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: load_zext_4i8_to_4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX-NEXT: retq +entry: + %X = load <4 x i8>, <4 x i8>* %ptr + %Y = zext <4 x i8> %X to <4 x i32> + ret <4 x i32> %Y +} + +define <4 x i64> @load_zext_4i8_to_4i64(<4 x i8> *%ptr) { +; SSE2-LABEL: load_zext_4i8_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_4i8_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_4i8_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_zext_4i8_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_zext_4i8_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_zext_4i8_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: retq +entry: + %X = load <4 x i8>, <4 x i8>* %ptr + %Y = zext <4 x i8> %X to <4 x i64> + ret <4 x i64> %Y +} + +define <8 x i16> @load_zext_8i8_to_8i16(<8 x i8> *%ptr) { +; SSE2-LABEL: load_zext_8i8_to_8i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_8i8_to_8i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_8i8_to_8i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; SSE41-NEXT: retq +; +; AVX-LABEL: load_zext_8i8_to_8i16: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX-NEXT: retq +entry: + %X = load <8 x i8>, <8 x i8>* %ptr + %Y = zext <8 x i8> %X to <8 x i16> + ret <8 x i16> %Y +} + +define <8 x i32> @load_zext_8i8_to_8i32(<8 x i8> *%ptr) { +; SSE2-LABEL: load_zext_8i8_to_8i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_8i8_to_8i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_8i8_to_8i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_zext_8i8_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_zext_8i8_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_zext_8i8_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: retq +entry: + %X = load <8 x i8>, <8 x i8>* %ptr + %Y = zext <8 x i8> %X to <8 x i32> + ret <8 x i32> %Y +} + +define <8 x i32> @load_zext_16i8_to_8i32(<16 x i8> *%ptr) { +; SSE2-LABEL: load_zext_16i8_to_8i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_16i8_to_8i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa (%rdi), %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_16i8_to_8i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa (%rdi), %xmm1 +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_zext_16i8_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_zext_16i8_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_zext_16i8_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: retq +entry: + %X = load <16 x i8>, <16 x i8>* %ptr + %Y = shufflevector <16 x i8> %X, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %Z = zext <8 x i8> %Y to <8 x i32> + ret <8 x i32> %Z +} + +define <8 x i64> @load_zext_8i8_to_8i64(<8 x i8> *%ptr) { +; SSE2-LABEL: load_zext_8i8_to_8i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_8i8_to_8i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[4],zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[6],zero,zero,zero,zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_8i8_to_8i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_zext_8i8_to_8i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_zext_8i8_to_8i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_zext_8i8_to_8i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: retq +entry: + %X = load <8 x i8>, <8 x i8>* %ptr + %Y = zext <8 x i8> %X to <8 x i64> + ret <8 x i64> %Y +} + +define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8> *%ptr) { +; SSE2-LABEL: load_zext_16i8_to_16i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_16i8_to_16i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa (%rdi), %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_16i8_to_16i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_zext_16i8_to_16i16: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_zext_16i8_to_16i16: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_zext_16i8_to_16i16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512-NEXT: retq +entry: + %X = load <16 x i8>, <16 x i8>* %ptr + %Y = zext <16 x i8> %X to <16 x i16> + ret <16 x i16> %Y +} + +define <2 x i64> @load_zext_2i16_to_2i64(<2 x i16> *%ptr) { +; SSE2-LABEL: load_zext_2i16_to_2i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_2i16_to_2i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_2i16_to_2i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: load_zext_2i16_to_2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; AVX-NEXT: retq +entry: + %X = load <2 x i16>, <2 x i16>* %ptr + %Y = zext <2 x i16> %X to <2 x i64> + ret <2 x i64> %Y +} + +define <4 x i32> @load_zext_4i16_to_4i32(<4 x i16> *%ptr) { +; SSE2-LABEL: load_zext_4i16_to_4i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_4i16_to_4i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_4i16_to_4i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; SSE41-NEXT: retq +; +; AVX-LABEL: load_zext_4i16_to_4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX-NEXT: retq +entry: + %X = load <4 x i16>, <4 x i16>* %ptr + %Y = zext <4 x i16> %X to <4 x i32> + ret <4 x i32> %Y +} + +define <4 x i64> @load_zext_4i16_to_4i64(<4 x i16> *%ptr) { +; SSE2-LABEL: load_zext_4i16_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_4i16_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_4i16_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_zext_4i16_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_zext_4i16_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_zext_4i16_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX512-NEXT: retq +entry: + %X = load <4 x i16>, <4 x i16>* %ptr + %Y = zext <4 x i16> %X to <4 x i64> + ret <4 x i64> %Y +} + +define <8 x i32> @load_zext_8i16_to_8i32(<8 x i16> *%ptr) { +; SSE2-LABEL: load_zext_8i16_to_8i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_8i16_to_8i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa (%rdi), %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_8i16_to_8i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_zext_8i16_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_zext_8i16_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_zext_8i16_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512-NEXT: retq +entry: + %X = load <8 x i16>, <8 x i16>* %ptr + %Y = zext <8 x i16> %X to <8 x i32> + ret <8 x i32> %Y +} + +define <2 x i64> @load_zext_2i32_to_2i64(<2 x i32> *%ptr) { +; SSE2-LABEL: load_zext_2i32_to_2i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_2i32_to_2i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_2i32_to_2i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; SSE41-NEXT: retq +; +; AVX-LABEL: load_zext_2i32_to_2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; AVX-NEXT: retq +entry: + %X = load <2 x i32>, <2 x i32>* %ptr + %Y = zext <2 x i32> %X to <2 x i64> + ret <2 x i64> %Y +} + +define <4 x i64> @load_zext_4i32_to_4i64(<4 x i32> *%ptr) { +; SSE2-LABEL: load_zext_4i32_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movaps (%rdi), %xmm1 +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_4i32_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movaps (%rdi), %xmm1 +; SSSE3-NEXT: xorps %xmm2, %xmm2 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_4i32_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_zext_4i32_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_zext_4i32_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_zext_4i32_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX512-NEXT: retq +entry: + %X = load <4 x i32>, <4 x i32>* %ptr + %Y = zext <4 x i32> %X to <4 x i64> + ret <4 x i64> %Y +} + +define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) { +; SSE2-LABEL: zext_8i8_to_8i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_8i8_to_8i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_8i8_to_8i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_8i8_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_8i8_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_8i8_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: retq +entry: + %t = zext <8 x i8> %z to <8 x i32> + ret <8 x i32> %t +} + +define <8 x i32> @shuf_zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: shuf_zext_8i16_to_8i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuf_zext_8i16_to_8i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuf_zext_8i16_to_8i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuf_zext_8i16_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuf_zext_8i16_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: shuf_zext_8i16_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: retq +entry: + %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8, i32 4, i32 8, i32 5, i32 8, i32 6, i32 8, i32 7, i32 8> + %Z = bitcast <16 x i16> %B to <8 x i32> + ret <8 x i32> %Z +} + +define <4 x i64> @shuf_zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: shuf_zext_4i32_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuf_zext_4i32_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movaps %xmm0, %xmm1 +; SSSE3-NEXT: xorps %xmm2, %xmm2 +; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuf_zext_4i32_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuf_zext_4i32_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuf_zext_4i32_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: shuf_zext_4i32_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: retq +entry: + %B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 1, i32 4, i32 2, i32 4, i32 3, i32 4> + %Z = bitcast <8 x i32> %B to <4 x i64> + ret <4 x i64> %Z +} + +define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) { +; SSE2-LABEL: shuf_zext_8i8_to_8i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuf_zext_8i8_to_8i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuf_zext_8i8_to_8i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuf_zext_8i8_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuf_zext_8i8_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: shuf_zext_8i8_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: retq +entry: + %B = shufflevector <8 x i8> %A, <8 x i8> zeroinitializer, <32 x i32> <i32 0, i32 8, i32 8, i32 8, i32 1, i32 8, i32 8, i32 8, i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8, i32 6, i32 8, i32 8, i32 8, i32 7, i32 8, i32 8, i32 8> + %Z = bitcast <32 x i8> %B to <8 x i32> + ret <8 x i32> %Z +} + +define <2 x i64> @shuf_zext_16i8_to_2i64_offset6(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: shuf_zext_16i8_to_2i64_offset6: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuf_zext_16i8_to_2i64_offset6: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuf_zext_16i8_to_2i64_offset6: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: psrlq $48, %xmm0 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuf_zext_16i8_to_2i64_offset6: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuf_zext_16i8_to_2i64_offset6: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuf_zext_16i8_to_2i64_offset6: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: shuf_zext_16i8_to_2i64_offset6: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: shuf_zext_16i8_to_2i64_offset6: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: retq +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> zeroinitializer, <16 x i32> <i32 6, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 7, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + %Z = bitcast <16 x i8> %B to <2 x i64> + ret <2 x i64> %Z +} + +define <4 x i64> @shuf_zext_16i8_to_4i64_offset11(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: shuf_zext_16i8_to_4i64_offset11: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuf_zext_16i8_to_4i64_offset11: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[11],zero,zero,zero,zero,zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[13],zero,zero,zero,zero,zero,zero,zero,xmm1[14],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuf_zext_16i8_to_4i64_offset11: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrldq {{.*#+}} xmm1 = xmm1[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuf_zext_16i8_to_4i64_offset11: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuf_zext_16i8_to_4i64_offset11: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: shuf_zext_16i8_to_4i64_offset11: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: retq +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> zeroinitializer, <32 x i32> <i32 11, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 12, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 13, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 14, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + %Z = bitcast <32 x i8> %B to <4 x i64> + ret <4 x i64> %Z +} + +define <2 x i64> @shuf_zext_8i16_to_2i64_offset6(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: shuf_zext_8i16_to_2i64_offset6: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuf_zext_8i16_to_2i64_offset6: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7],zero,zero,zero,zero,zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuf_zext_8i16_to_2i64_offset6: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuf_zext_8i16_to_2i64_offset6: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuf_zext_8i16_to_2i64_offset6: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuf_zext_8i16_to_2i64_offset6: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7],zero,zero,zero,zero,zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: shuf_zext_8i16_to_2i64_offset6: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: shuf_zext_8i16_to_2i64_offset6: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7],zero,zero,zero,zero,zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: retq +entry: + %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <8 x i32> <i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8> + %Z = bitcast <8 x i16> %B to <2 x i64> + ret <2 x i64> %Z +} + +define <4 x i64> @shuf_zext_8i16_to_4i64_offset2(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: shuf_zext_8i16_to_4i64_offset2: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuf_zext_8i16_to_4i64_offset2: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuf_zext_8i16_to_4i64_offset2: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuf_zext_8i16_to_4i64_offset2: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuf_zext_8i16_to_4i64_offset2: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: shuf_zext_8i16_to_4i64_offset2: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3] +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512-NEXT: retq +entry: + %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8> + %Z = bitcast <16 x i16> %B to <4 x i64> + ret <4 x i64> %Z +} + +define <4 x i32> @shuf_zext_8i16_to_4i32_offset1(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: shuf_zext_8i16_to_4i32_offset1: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuf_zext_8i16_to_4i32_offset1: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuf_zext_8i16_to_4i32_offset1: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuf_zext_8i16_to_4i32_offset1: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuf_zext_8i16_to_4i32_offset1: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuf_zext_8i16_to_4i32_offset1: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,xmm0[4,5],zero,zero,xmm0[6,7],zero,zero,xmm0[8,9],zero,zero +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: shuf_zext_8i16_to_4i32_offset1: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: shuf_zext_8i16_to_4i32_offset1: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,xmm0[4,5],zero,zero,xmm0[6,7],zero,zero,xmm0[8,9],zero,zero +; AVX512BW-NEXT: retq +entry: + %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 8, i32 2, i32 8, i32 3, i32 8, i32 4, i32 8> + %Z = bitcast <8 x i16> %B to <4 x i32> + ret <4 x i32> %Z +} + +define <8 x i32> @shuf_zext_8i16_to_8i32_offset3(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: shuf_zext_8i16_to_8i32_offset3: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuf_zext_8i16_to_8i32_offset3: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuf_zext_8i16_to_8i32_offset3: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE41-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuf_zext_8i16_to_8i32_offset3: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuf_zext_8i16_to_8i32_offset3: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: shuf_zext_8i16_to_8i32_offset3: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: retq +entry: + %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 3, i32 8, i32 4, i32 8, i32 5, i32 8, i32 6, i32 8, i32 7, i32 8, i32 undef, i32 8, i32 undef, i32 8, i32 undef, i32 8> + %Z = bitcast <16 x i16> %B to <8 x i32> + ret <8 x i32> %Z +} + +define <8 x i32> @shuf_zext_16i16_to_8i32_offset8(<16 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: shuf_zext_16i16_to_8i32_offset8: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuf_zext_16i16_to_8i32_offset8: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuf_zext_16i16_to_8i32_offset8: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuf_zext_16i16_to_8i32_offset8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuf_zext_16i16_to_8i32_offset8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: shuf_zext_16i16_to_8i32_offset8: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: retq +entry: + %B = shufflevector <16 x i16> %A, <16 x i16> zeroinitializer, <16 x i32> <i32 8, i32 16, i32 9, i32 16, i32 10, i32 16, i32 11, i32 16, i32 12, i32 16, i32 undef, i32 16, i32 14, i32 16, i32 undef, i32 16> + %Z = bitcast <16 x i16> %B to <8 x i32> + ret <8 x i32> %Z +} + +define <2 x i64> @shuf_zext_4i32_to_2i64_offset2(<4 x i32> %A) nounwind uwtable readnone ssp { +; SSE-LABEL: shuf_zext_4i32_to_2i64_offset2: +; SSE: # %bb.0: # %entry +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: retq +; +; AVX-LABEL: shuf_zext_4i32_to_2i64_offset2: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: retq +entry: + %B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 3, i32 4> + %Z = bitcast <4 x i32> %B to <2 x i64> + ret <2 x i64> %Z +} + +define <4 x i64> @shuf_zext_4i32_to_4i64_offset1(<4 x i32> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: shuf_zext_4i32_to_4i64_offset1: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,0,4294967295,0] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuf_zext_4i32_to_4i64_offset1: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,0,4294967295,0] +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: psrldq {{.*#+}} xmm1 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuf_zext_4i32_to_4i64_offset1: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] +; SSE41-NEXT: psrldq {{.*#+}} xmm1 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuf_zext_4i32_to_4i64_offset1: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuf_zext_4i32_to_4i64_offset1: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3] +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: shuf_zext_4i32_to_4i64_offset1: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3] +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: retq +entry: + %B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <8 x i32> <i32 undef, i32 4, i32 2, i32 4, i32 3, i32 4, i32 undef, i32 4> + %Z = bitcast <8 x i32> %B to <4 x i64> + ret <4 x i64> %Z +} + +define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) { +; SSE2-LABEL: zext_32i8_to_32i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: movdqa %xmm1, 112(%rdi) +; SSE2-NEXT: movdqa %xmm4, 96(%rdi) +; SSE2-NEXT: movdqa %xmm6, 80(%rdi) +; SSE2-NEXT: movdqa %xmm7, 64(%rdi) +; SSE2-NEXT: movdqa %xmm0, 48(%rdi) +; SSE2-NEXT: movdqa %xmm5, 32(%rdi) +; SSE2-NEXT: movdqa %xmm3, 16(%rdi) +; SSE2-NEXT: movdqa %xmm8, (%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_32i8_to_32i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movq %rdi, %rax +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSSE3-NEXT: movdqa %xmm3, %xmm8 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSSE3-NEXT: movdqa %xmm1, %xmm6 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; SSSE3-NEXT: movdqa %xmm6, %xmm7 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: movdqa %xmm1, 112(%rdi) +; SSSE3-NEXT: movdqa %xmm4, 96(%rdi) +; SSSE3-NEXT: movdqa %xmm6, 80(%rdi) +; SSSE3-NEXT: movdqa %xmm7, 64(%rdi) +; SSSE3-NEXT: movdqa %xmm0, 48(%rdi) +; SSSE3-NEXT: movdqa %xmm5, 32(%rdi) +; SSSE3-NEXT: movdqa %xmm3, 16(%rdi) +; SSSE3-NEXT: movdqa %xmm8, (%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_32i8_to_32i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movq %rdi, %rax +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,0,1] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE41-NEXT: movdqa %xmm1, 112(%rdi) +; SSE41-NEXT: movdqa %xmm7, 96(%rdi) +; SSE41-NEXT: movdqa %xmm6, 80(%rdi) +; SSE41-NEXT: movdqa %xmm5, 64(%rdi) +; SSE41-NEXT: movdqa %xmm0, 48(%rdi) +; SSE41-NEXT: movdqa %xmm4, 32(%rdi) +; SSE41-NEXT: movdqa %xmm3, 16(%rdi) +; SSE41-NEXT: movdqa %xmm2, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_32i8_to_32i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-NEXT: vmovaps %ymm4, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_32i8_to_32i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vmovdqa %ymm4, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_32i8_to_32i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-NEXT: retq + %res = zext <32 x i8>%x to <32 x i32> + ret <32 x i32> %res +} + +define <2 x i32> @zext_2i8_to_2i32(<2 x i8>* %addr) { +; SSE2-LABEL: zext_2i8_to_2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movzwl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: paddd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_2i8_to_2i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movzwl (%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: paddd %xmm0, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_2i8_to_2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movzwl (%rdi), %eax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: paddd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: zext_2i8_to_2i32: +; AVX: # %bb.0: +; AVX-NEXT: movzwl (%rdi), %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq + %x = load <2 x i8>, <2 x i8>* %addr, align 1 + %y = zext <2 x i8> %x to <2 x i32> + %z = add <2 x i32>%y, %y + ret <2 x i32>%z +} + +define <4 x i32> @zext_4i17_to_4i32(<4 x i17>* %ptr) { +; SSE2-LABEL: zext_4i17_to_4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq (%rdi), %rax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq $17, %rcx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movl 8(%rdi), %ecx +; SSE2-NEXT: shll $13, %ecx +; SSE2-NEXT: movq %rax, %rdx +; SSE2-NEXT: shrq $51, %rdx +; SSE2-NEXT: orl %ecx, %edx +; SSE2-NEXT: movd %edx, %xmm1 +; SSE2-NEXT: shrq $34, %rax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_4i17_to_4i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movq (%rdi), %rax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movq %rax, %rcx +; SSSE3-NEXT: shrq $17, %rcx +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: movl 8(%rdi), %ecx +; SSSE3-NEXT: shll $13, %ecx +; SSSE3-NEXT: movq %rax, %rdx +; SSSE3-NEXT: shrq $51, %rdx +; SSSE3-NEXT: orl %ecx, %edx +; SSSE3-NEXT: movd %edx, %xmm1 +; SSSE3-NEXT: shrq $34, %rax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_4i17_to_4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movl 8(%rdi), %eax +; SSE41-NEXT: shll $13, %eax +; SSE41-NEXT: movq (%rdi), %rcx +; SSE41-NEXT: movq %rcx, %rdx +; SSE41-NEXT: shrq $51, %rdx +; SSE41-NEXT: orl %eax, %edx +; SSE41-NEXT: movq %rcx, %rax +; SSE41-NEXT: shrq $17, %rax +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrd $1, %eax, %xmm0 +; SSE41-NEXT: shrq $34, %rcx +; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 +; SSE41-NEXT: pinsrd $3, %edx, %xmm0 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_4i17_to_4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: movl 8(%rdi), %eax +; AVX1-NEXT: shll $13, %eax +; AVX1-NEXT: movq (%rdi), %rcx +; AVX1-NEXT: movq %rcx, %rdx +; AVX1-NEXT: shrq $51, %rdx +; AVX1-NEXT: orl %eax, %edx +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: shrq $17, %rax +; AVX1-NEXT: vmovd %ecx, %xmm0 +; AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX1-NEXT: shrq $34, %rcx +; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_4i17_to_4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: movl 8(%rdi), %eax +; AVX2-NEXT: shll $13, %eax +; AVX2-NEXT: movq (%rdi), %rcx +; AVX2-NEXT: movq %rcx, %rdx +; AVX2-NEXT: shrq $51, %rdx +; AVX2-NEXT: orl %eax, %edx +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: shrq $17, %rax +; AVX2-NEXT: vmovd %ecx, %xmm0 +; AVX2-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX2-NEXT: shrq $34, %rcx +; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [131071,131071,131071,131071] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_4i17_to_4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: movl 8(%rdi), %eax +; AVX512-NEXT: shll $13, %eax +; AVX512-NEXT: movq (%rdi), %rcx +; AVX512-NEXT: movq %rcx, %rdx +; AVX512-NEXT: shrq $51, %rdx +; AVX512-NEXT: orl %eax, %edx +; AVX512-NEXT: movq %rcx, %rax +; AVX512-NEXT: shrq $17, %rax +; AVX512-NEXT: vmovd %ecx, %xmm0 +; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX512-NEXT: shrq $34, %rcx +; AVX512-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [131071,131071,131071,131071] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %a = load <4 x i17>, <4 x i17>* %ptr + %b = zext <4 x i17> %a to <4 x i32> + ret <4 x i32> %b +} + +define <8 x i64> @zext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_8i6_to_8i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movd %edi, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSE2-NEXT: paddw {{.*}}(%rip), %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [63,63] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7] +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,3,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7] +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_8i6_to_8i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movd %edi, %xmm0 +; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSSE3-NEXT: paddw {{.*}}(%rip), %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] +; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [63,63] +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,3] +; SSSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7] +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3] +; SSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7] +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,3,3] +; SSSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7] +; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_8i6_to_8i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSE41-NEXT: paddw {{.*}}(%rip), %xmm3 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [63,63] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,2,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: pand %xmm4, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_8i6_to_8i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovd %edi, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_8i6_to_8i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_8i6_to_8i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovd %edi, %xmm0 +; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NEXT: retq +entry: + %a = trunc i32 %x to i6 + %b = insertelement <8 x i6> undef, i6 %a, i32 0 + %c = shufflevector <8 x i6> %b, <8 x i6> undef, <8 x i32> zeroinitializer + %d = add <8 x i6> %c, <i6 0, i6 1, i6 2, i6 3, i6 4, i6 5, i6 6, i6 7> + %e = zext <8 x i6> %d to <8 x i64> + ret <8 x i64> %e +} + +define <4 x i64> @splatshuf_zext_v4i64(<4 x i32> %x) { +; SSE2-LABEL: splatshuf_zext_v4i64: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: splatshuf_zext_v4i64: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: splatshuf_zext_v4i64: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatshuf_zext_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatshuf_zext_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: splatshuf_zext_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: retq + %shuf = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer + %ext = zext <4 x i32> %shuf to <4 x i64> + ret <4 x i64> %ext +} + +define <8 x i32> @splatshuf_zext_v8i32_matching_undefs(<8 x i16> %x) { +; SSE2-LABEL: splatshuf_zext_v8i32_matching_undefs: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,7,7] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: splatshuf_zext_v8i32_matching_undefs: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[u,u],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: splatshuf_zext_v8i32_matching_undefs: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,6,7,14,15,8,9,10,11,12,13,14,15] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatshuf_zext_v8i32_matching_undefs: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[6,7],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatshuf_zext_v8i32_matching_undefs: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,6,7,14,15,0,1,6,7,6,7,14,15] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: splatshuf_zext_v8i32_matching_undefs: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,6,7,14,15,0,1,6,7,6,7,14,15] +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: retq + %shuf = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 3, i32 7, i32 0, i32 undef, i32 3, i32 7> + %ext = zext <8 x i16> %shuf to <8 x i32> + ret <8 x i32> %ext +} + +define <8 x i32> @splatshuf_zext_v8i32_unmatched_undef(<8 x i16> %x) { +; SSE2-LABEL: splatshuf_zext_v8i32_unmatched_undef: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: splatshuf_zext_v8i32_unmatched_undef: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[2,3],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: splatshuf_zext_v8i32_unmatched_undef: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,14,15,6,7,12,13,14,15] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatshuf_zext_v8i32_unmatched_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatshuf_zext_v8i32_unmatched_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: splatshuf_zext_v8i32_unmatched_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15] +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: retq + %shuf = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 3, i32 7, i32 0, i32 undef, i32 3, i32 7> + %ext = zext <8 x i16> %shuf to <8 x i32> + ret <8 x i32> %ext +} + +define <16 x i16> @splatshuf_zext_v16i16(<16 x i8> %x) { +; SSE2-LABEL: splatshuf_zext_v16i16: +; SSE2: # %bb.0: +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: splatshuf_zext_v16i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: splatshuf_zext_v16i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,14,14,14,14,14,14,14,14,14,14,14,14,14,15,15] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatshuf_zext_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatshuf_zext_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: splatshuf_zext_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: retq + %shuf = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14> + %ext = zext <16 x i8> %shuf to <16 x i16> + ret <16 x i16> %ext +} diff --git a/llvm/test/CodeGen/X86/widen_cast-4.ll b/llvm/test/CodeGen/X86/widen_cast-4.ll index f317d4b5913..949b6dd9db0 100644 --- a/llvm/test/CodeGen/X86/widen_cast-4.ll +++ b/llvm/test/CodeGen/X86/widen_cast-4.ll @@ -1,9 +1,45 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=WIDE +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=NARROW +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=WIDE ; FIXME: We shouldn't require both a movd and an insert in the wide version. define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind { +; NARROW-LABEL: update: +; NARROW: # %bb.0: # %entry +; NARROW-NEXT: subl $12, %esp +; NARROW-NEXT: movl $0, (%esp) +; NARROW-NEXT: pcmpeqd %xmm0, %xmm0 +; NARROW-NEXT: movdqa {{.*#+}} xmm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; NARROW-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; NARROW-NEXT: .p2align 4, 0x90 +; NARROW-NEXT: .LBB0_1: # %forcond +; NARROW-NEXT: # =>This Inner Loop Header: Depth=1 +; NARROW-NEXT: movl (%esp), %eax +; NARROW-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; NARROW-NEXT: jge .LBB0_3 +; NARROW-NEXT: # %bb.2: # %forbody +; NARROW-NEXT: # in Loop: Header=BB0_1 Depth=1 +; NARROW-NEXT: movl (%esp), %eax +; NARROW-NEXT: leal (,%eax,8), %ecx +; NARROW-NEXT: movl {{[0-9]+}}(%esp), %edx +; NARROW-NEXT: addl %ecx, %edx +; NARROW-NEXT: movl %edx, {{[0-9]+}}(%esp) +; NARROW-NEXT: addl {{[0-9]+}}(%esp), %ecx +; NARROW-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; NARROW-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; NARROW-NEXT: psubb %xmm0, %xmm3 +; NARROW-NEXT: psrlw $2, %xmm3 +; NARROW-NEXT: pand %xmm1, %xmm3 +; NARROW-NEXT: pxor %xmm2, %xmm3 +; NARROW-NEXT: psubb %xmm2, %xmm3 +; NARROW-NEXT: movq %xmm3, (%edx,%eax,8) +; NARROW-NEXT: incl (%esp) +; NARROW-NEXT: jmp .LBB0_1 +; NARROW-NEXT: .LBB0_3: # %afterfor +; NARROW-NEXT: addl $12, %esp +; NARROW-NEXT: retl +; ; WIDE-LABEL: update: ; WIDE: # %bb.0: # %entry ; WIDE-NEXT: subl $12, %esp diff --git a/llvm/test/CodeGen/X86/widen_conversions.ll b/llvm/test/CodeGen/X86/widen_conversions.ll index d9d43bb4ad2..acd8c78fa2d 100644 --- a/llvm/test/CodeGen/X86/widen_conversions.ll +++ b/llvm/test/CodeGen/X86/widen_conversions.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=X64 define <4 x i32> @zext_v4i8_to_v4i32(<4 x i8>* %ptr) { ; X86-LABEL: zext_v4i8_to_v4i32: diff --git a/llvm/test/CodeGen/X86/widen_mul.ll b/llvm/test/CodeGen/X86/widen_mul.ll index 783ca948b0a..b3ccb961c55 100644 --- a/llvm/test/CodeGen/X86/widen_mul.ll +++ b/llvm/test/CodeGen/X86/widen_mul.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW ; Test multiplies of various narrow types. |