| author | Cong Hou <congh@google.com> | 2015-11-24 05:44:19 +0000 |
|---|---|---|
| committer | Cong Hou <congh@google.com> | 2015-11-24 05:44:19 +0000 |
| commit | bed60d35ed332f12a7c54eb16e59b69adb5d8c15 | |
| tree | 44020f50bbb10215241bea640004e32eea66749c | |
| parent | d68ba4255627a31196d13e6cde0568f643bd94d7 | |
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type: a vector of unsigned
i8 or unsigned i16 elements. In the IR, the i8/i16 values are promoted to i32
before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to an X86ISD::AVG instruction.
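
For reference, the scalar computation this corresponds to is an unsigned
rounding average done in a wider type so that the "+ 1" cannot overflow. A
minimal C++ sketch (illustrative only, not code from this patch; the name
avg_u8 is made up):

#include <cstdint>

// Rounding average of unsigned bytes: widen to 32 bits, add 1, shift right,
// then narrow back. A vectorizer turns this loop into exactly the
// zext/add/add/lshr/trunc chain shown above, which PAVGB computes per lane.
void avg_u8(uint8_t *c, const uint8_t *a, const uint8_t *b, int n) {
  for (int i = 0; i < n; ++i)
    c[i] = static_cast<uint8_t>((static_cast<uint32_t>(a[i]) + b[i] + 1) >> 1);
}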
The pattern recognition is done while combining instructions, just before type
legalization during instruction selection. We do it at that point because,
after type legalization, pattern recognition becomes much harder: the pattern
is then spread across many instructions that perform type conversions. As a
consequence, for target-specific nodes such as X86ISD::AVG, we need to take
care of type legalization ourselves. However, since X86ISD::AVG behaves much
like ISD::ADD, I am wondering whether there is a way to legalize the operands
and result type of X86ISD::AVG together with ISD::ADD; the current design does
not seem to support this.
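
As a rough illustration of what such a pre-legalization combine can look like,
here is a C++ sketch against the SelectionDAG API. It is not the code added by
this patch: it only handles the two operand orderings exercised by the tests
below, and it omits the subtarget and legality checks a real combine needs.

#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "X86ISelLowering.h" // in-tree target header providing X86ISD::AVG

using namespace llvm;

// True if V is a BUILD_VECTOR splat of the constant 1.
static bool isSplatOfOne(SDValue V) {
  auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
  if (!BV)
    return false;
  APInt SplatVal, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  return BV->isConstantSplat(SplatVal, SplatUndef, SplatBitSize,
                             HasAnyUndefs) &&
         SplatVal == 1;
}

// If V is (zero_extend X) from the narrow vector type, return X.
static SDValue peelZExt(SDValue V, EVT NarrowVT) {
  if (V.getOpcode() == ISD::ZERO_EXTEND &&
      V.getOperand(0).getValueType() == NarrowVT)
    return V.getOperand(0);
  return SDValue();
}

// Match trunc (lshr (zext a + zext b + splat 1), splat 1) and rebuild it as
// X86ISD::AVG on the narrow type. Returns a null SDValue if there is no match.
static SDValue combineToAVG(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0); // the narrow vXi8 / vXi16 result type
  if (N->getOpcode() != ISD::TRUNCATE || !VT.isVector())
    return SDValue();

  SDValue Shr = N->getOperand(0);
  if (Shr.getOpcode() != ISD::SRL || !isSplatOfOne(Shr.getOperand(1)))
    return SDValue();

  SDValue Sum = Shr.getOperand(0);
  if (Sum.getOpcode() != ISD::ADD)
    return SDValue();
  SDValue L = Sum.getOperand(0), R = Sum.getOperand(1);

  SDValue A, B;
  if (isSplatOfOne(L) || isSplatOfOne(R)) {
    // Shape: (zext a + zext b) + 1, with the +1 on the outer add.
    SDValue Inner = isSplatOfOne(L) ? R : L;
    if (Inner.getOpcode() != ISD::ADD)
      return SDValue();
    A = peelZExt(Inner.getOperand(0), VT);
    B = peelZExt(Inner.getOperand(1), VT);
  } else if (L.getOpcode() == ISD::ADD && isSplatOfOne(L.getOperand(1))) {
    // Shape: (zext a + 1) + zext b, as in the IR example above.
    A = peelZExt(L.getOperand(0), VT);
    B = peelZExt(R, VT);
  } else {
    return SDValue();
  }
  if (!A || !B)
    return SDValue();

  SDLoc DL(N);
  return DAG.getNode(X86ISD::AVG, DL, VT, A, B);
}

The point relevant to the discussion above is that the zext and trunc nodes are
still visible at this stage; once type legalization has widened or split them,
the pattern is spread over far more nodes and is much harder to recognize.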
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 element
types at various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
Diffstat (limited to 'llvm/test/CodeGen/X86/avg.ll')
| -rw-r--r-- | llvm/test/CodeGen/X86/avg.ll | 627 |
1 file changed, 627 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
new file mode 100644
index 00000000000..ce2bf0fdad1
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -0,0 +1,627 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=AVX2 --check-prefix=AVX512BW
+
+define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) {
+; SSE2-LABEL: avg_v4i8
+; SSE2: # BB#0:
+; SSE2-NEXT: movd (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd (%rsi), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: pavgb %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v4i8
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovd (%rdi), %xmm0
+; AVX2-NEXT: vmovd (%rsi), %xmm1
+; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovd %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+ %1 = load <4 x i8>, <4 x i8>* %a
+ %2 = load <4 x i8>, <4 x i8>* %b
+ %3 = zext <4 x i8> %1 to <4 x i32>
+ %4 = zext <4 x i8> %2 to <4 x i32>
+ %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
+ %6 = add nuw nsw <4 x i32> %5, %4
+ %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
+ %8 = trunc <4 x i32> %7 to <4 x i8>
+ store <4 x i8> %8, <4 x i8>* undef, align 4
+ ret void
+}
+
+define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) {
+; SSE2-LABEL: avg_v8i8
+; SSE2: # BB#0:
+; SSE2-NEXT: movq (%rdi), %xmm0 # xmm0 = mem[0],zero
+; SSE2-NEXT: movq (%rsi), %xmm1 # xmm1 = mem[0],zero
+; SSE2-NEXT: pavgb %xmm0, %xmm1
+; SSE2-NEXT: movq %xmm1, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v8i8
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq (%rdi), %xmm0
+; AVX2-NEXT: vmovq (%rsi), %xmm1
+; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+ %1 = load <8 x i8>, <8 x i8>* %a
+ %2 = load <8 x i8>, <8 x i8>* %b
+ %3 = zext <8 x i8> %1 to <8 x i32>
+ %4 = zext <8 x i8> %2 to <8 x i32>
+ %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %6 = add nuw nsw <8 x i32> %5, %4
+ %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %8 = trunc <8 x i32> %7 to <8 x i8>
+ store <8 x i8> %8, <8 x i8>* undef, align 4
+ ret void
+}
+
+define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) {
+; SSE2-LABEL: avg_v16i8
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa (%rsi), %xmm0
+; SSE2-NEXT: pavgb (%rdi), %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v16i8
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rsi), %xmm0
+; AVX2-NEXT: vpavgb (%rdi), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+ %1 = load <16 x i8>, <16 x i8>* %a
+ %2 = load <16 x i8>, <16 x i8>* %b
+ %3 = zext <16 x i8> %1 to <16 x i32>
+ %4 = zext <16 x i8> %2 to <16 x i32>
+ %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %6 = add nuw nsw <16 x i32> %5, %4
+ %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %8 = trunc <16 x i32> %7 to <16 x i8>
+ store <16 x i8> %8, <16 x i8>* undef, align 4
+ ret void
+}
+
+define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
+; AVX2-LABEL: avg_v32i8
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rsi), %ymm0
+; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+;
+ %1 = load <32 x i8>, <32 x i8>* %a
+ %2 = load <32 x i8>, <32 x i8>* %b
+ %3 = zext <32 x i8> %1 to <32 x i32>
+ %4 = zext <32 x i8> %2 to <32 x i32>
+ %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %6 = add nuw nsw <32 x i32> %5, %4
+ %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %8 = trunc <32 x i32> %7 to <32 x i8>
+ store <32 x i8> %8, <32 x i8>* undef, align 4
+ ret void
+}
+
+define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
+; AVX512BW-LABEL: avg_v64i8
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0
+; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT: retq
+;
+ %1 = load <64 x i8>, <64 x i8>* %a
+ %2 = load <64 x i8>, <64 x i8>* %b
+ %3 = zext <64 x i8> %1 to <64 x i32>
+ %4 = zext <64 x i8> %2 to <64 x i32>
+ %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %6 = add nuw nsw <64 x i32> %5, %4
+ %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %8 = trunc <64 x i32> %7 to <64 x i8>
+ store <64 x i8> %8, <64 x i8>* undef, align 4
+ ret void
+}
+
+define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) {
+; SSE2-LABEL: avg_v4i16
+; SSE2: # BB#0:
+; SSE2-NEXT: movq (%rdi), %xmm0 # xmm0 = mem[0],zero
+; SSE2-NEXT: movq (%rsi), %xmm1 # xmm1 = mem[0],zero
+; SSE2-NEXT: pavgw %xmm0, %xmm1
+; SSE2-NEXT: movq %xmm1, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v4i16
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq (%rdi), %xmm0
+; AVX2-NEXT: vmovq (%rsi), %xmm1
+; AVX2-NEXT: vpavgw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+ %1 = load <4 x i16>, <4 x i16>* %a
+ %2 = load <4 x i16>, <4 x i16>* %b
+ %3 = zext <4 x i16> %1 to <4 x i32>
+ %4 = zext <4 x i16> %2 to <4 x i32>
+ %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
+ %6 = add nuw nsw <4 x i32> %5, %4
+ %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
+ %8 = trunc <4 x i32> %7 to <4 x i16>
+ store <4 x i16> %8, <4 x i16>* undef, align 4
+ ret void
+}
+
+define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) {
+; SSE2-LABEL: avg_v8i16
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa (%rsi), %xmm0
+; SSE2-NEXT: pavgw (%rdi), %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v8i16
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rsi), %xmm0
+; AVX2-NEXT: vpavgw (%rdi), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+ %1 = load <8 x i16>, <8 x i16>* %a
+ %2 = load <8 x i16>, <8 x i16>* %b
+ %3 = zext <8 x i16> %1 to <8 x i32>
+ %4 = zext <8 x i16> %2 to <8 x i32>
+ %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %6 = add nuw nsw <8 x i32> %5, %4
+ %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %8 = trunc <8 x i32> %7 to <8 x i16>
+ store <8 x i16> %8, <8 x i16>* undef, align 4
+ ret void
+}
+
+define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
+; AVX2-LABEL: avg_v16i16
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rsi), %ymm0
+; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+;
+ %1 = load <16 x i16>, <16 x i16>* %a
+ %2 = load <16 x i16>, <16 x i16>* %b
+ %3 = zext <16 x i16> %1 to <16 x i32>
+ %4 = zext <16 x i16> %2 to <16 x i32>
+ %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %6 = add nuw nsw <16 x i32> %5, %4
+ %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %8 = trunc <16 x i32> %7 to <16 x i16>
+ store <16 x i16> %8, <16 x i16>* undef, align 4
+ ret void
+}
+
+define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
+; AVX512BW-LABEL: avg_v32i16
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0
+; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
+; AVX512BW-NEXT: retq
+;
+ %1 = load <32 x i16>, <32 x i16>* %a
+ %2 = load <32 x i16>, <32 x i16>* %b
+ %3 = zext <32 x i16> %1 to <32 x i32>
+ %4 = zext <32 x i16> %2 to <32 x i32>
+ %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %6 = add nuw nsw <32 x i32> %5, %4
+ %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %8 = trunc <32 x i32> %7 to <32 x i16>
+ store <32 x i16> %8, <32 x i16>* undef, align 4
+ ret void
+}
+
+define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) {
+; SSE2-LABEL: avg_v4i8_2
+; SSE2: # BB#0:
+; SSE2-NEXT: movd (%rdi), %xmm0
+; SSE2-NEXT: movd (%rsi), %xmm1
+; SSE2-NEXT: pavgb %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v4i8_2
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovd (%rdi), %xmm0
+; AVX2-NEXT: vmovd (%rsi), %xmm1
+; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+ %1 = load <4 x i8>, <4 x i8>* %a
+ %2 = load <4 x i8>, <4 x i8>* %b
+ %3 = zext <4 x i8> %1 to <4 x i32>
+ %4 = zext <4 x i8> %2 to <4 x i32>
+ %5 = add nuw nsw <4 x i32> %3, %4
+ %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
+ %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
+ %8 = trunc <4 x i32> %7 to <4 x i8>
+ store <4 x i8> %8, <4 x i8>* undef, align 4
+ ret void
+}
+
+define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) {
+; SSE2-LABEL: avg_v8i8_2
+; SSE2: # BB#0:
+; SSE2-NEXT: movq (%rdi), %xmm0 # xmm0 = mem[0],zero
+; SSE2-NEXT: movq (%rsi), %xmm1 # xmm1 = mem[0],zero
+; SSE2-NEXT: pavgb %xmm0, %xmm1
+; SSE2-NEXT: movq %xmm1, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v8i8_2
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq (%rdi), %xmm0
+; AVX2-NEXT: vmovq (%rsi), %xmm1
+; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+ %1 = load <8 x i8>, <8 x i8>* %a
+ %2 = load <8 x i8>, <8 x i8>* %b
+ %3 = zext <8 x i8> %1 to <8 x i32>
+ %4 = zext <8 x i8> %2 to <8 x i32>
+ %5 = add nuw nsw <8 x i32> %3, %4
+ %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %8 = trunc <8 x i32> %7 to <8 x i8>
+ store <8 x i8> %8, <8 x i8>* undef, align 4
+ ret void
+}
+
+define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) {
+; SSE2-LABEL: avg_v16i8_2
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: pavgb (%rsi), %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v16i8_2
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpavgb (%rsi), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+ %1 = load <16 x i8>, <16 x i8>* %a
+ %2 = load <16 x i8>, <16 x i8>* %b
+ %3 = zext <16 x i8> %1 to <16 x i32>
+ %4 = zext <16 x i8> %2 to <16 x i32>
+ %5 = add nuw nsw <16 x i32> %3, %4
+ %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %8 = trunc <16 x i32> %7 to <16 x i8>
+ store <16 x i8> %8, <16 x i8>* undef, align 4
+ ret void
+}
+
+define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
+; AVX2-LABEL: avg_v32i8_2
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+;
+ %1 = load <32 x i8>, <32 x i8>* %a
+ %2 = load <32 x i8>, <32 x i8>* %b
+ %3 = zext <32 x i8> %1 to <32 x i32>
+ %4 = zext <32 x i8> %2 to <32 x i32>
+ %5 = add nuw nsw <32 x i32> %3, %4
+ %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %8 = trunc <32 x i32> %7 to <32 x i8>
+ store <32 x i8> %8, <32 x i8>* undef, align 4
+ ret void
+}
+
+define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) {
+; AVX512BW-LABEL: avg_v64i8_2
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0
+; AVX512BW-NEXT: vpavgb %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT: retq
+;
+ %1 = load <64 x i8>, <64 x i8>* %a
+ %2 = load <64 x i8>, <64 x i8>* %b
+ %3 = zext <64 x i8> %1 to <64 x i32>
+ %4 = zext <64 x i8> %2 to <64 x i32>
+ %5 = add nuw nsw <64 x i32> %4, %4
+ %6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %8 = trunc <64 x i32> %7 to <64 x i8>
+ store <64 x i8> %8, <64 x i8>* undef, align 4
+ ret void
+}
+
+
+define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) {
+; SSE2-LABEL: avg_v4i16_2
+; SSE2: # BB#0:
+; SSE2-NEXT: movq (%rdi), %xmm0 # xmm0 = mem[0],zero
+; SSE2-NEXT: movq (%rsi), %xmm1 # xmm1 = mem[0],zero
+; SSE2-NEXT: pavgw %xmm0, %xmm1
+; SSE2-NEXT: movq %xmm1, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v4i16_2
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq (%rdi), %xmm0
+; AVX2-NEXT: vmovq (%rsi), %xmm1
+; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+ %1 = load <4 x i16>, <4 x i16>* %a
+ %2 = load <4 x i16>, <4 x i16>* %b
+ %3 = zext <4 x i16> %1 to <4 x i32>
+ %4 = zext <4 x i16> %2 to <4 x i32>
+ %5 = add nuw nsw <4 x i32> %3, %4
+ %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
+ %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
+ %8 = trunc <4 x i32> %7 to <4 x i16>
+ store <4 x i16> %8, <4 x i16>* undef, align 4
+ ret void
+}
+
+define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) {
+; SSE2-LABEL: avg_v8i16_2
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: pavgw (%rsi), %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v8i16_2
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpavgw (%rsi), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+ %1 = load <8 x i16>, <8 x i16>* %a
+ %2 = load <8 x i16>, <8 x i16>* %b
+ %3 = zext <8 x i16> %1 to <8 x i32>
+ %4 = zext <8 x i16> %2 to <8 x i32>
+ %5 = add nuw nsw <8 x i32> %3, %4
+ %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %8 = trunc <8 x i32> %7 to <8 x i16>
+ store <8 x i16> %8, <8 x i16>* undef, align 4
+ ret void
+}
+
+define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
+; AVX2-LABEL: avg_v16i16_2
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+;
+ %1 = load <16 x i16>, <16 x i16>* %a
+ %2 = load <16 x i16>, <16 x i16>* %b
+ %3 = zext <16 x i16> %1 to <16 x i32>
+ %4 = zext <16 x i16> %2 to <16 x i32>
+ %5 = add nuw nsw <16 x i32> %3, %4
+ %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %8 = trunc <16 x i32> %7 to <16 x i16>
+ store <16 x i16> %8, <16 x i16>* undef, align 4
+ ret void
+}
+
+define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
+; AVX512BW-LABEL: avg_v32i16_2
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
+; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
+; AVX512BW-NEXT: retq
+;
+ %1 = load <32 x i16>, <32 x i16>* %a
+ %2 = load <32 x i16>, <32 x i16>* %b
+ %3 = zext <32 x i16> %1 to <32 x i32>
+ %4 = zext <32 x i16> %2 to <32 x i32>
+ %5 = add nuw nsw <32 x i32> %3, %4
+ %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %8 = trunc <32 x i32> %7 to <32 x i16>
+ store <32 x i16> %8, <32 x i16>* undef, align 4
+ ret void
+}
+
+define void @avg_v4i8_const(<4 x i8>* %a) {
+; SSE2-LABEL: avg_v4i8_const
+; SSE2: # BB#0:
+; SSE2-NEXT: movd (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: pavgb {{.*}}, %xmm0
+; SSE2-NEXT: movd %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v4i8_const
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovd (%rdi), %xmm0
+; AVX2-NEXT: vpavgb {{.*}}, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+ %1 = load <4 x i8>, <4 x i8>* %a
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
+ %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
+ %5 = trunc <4 x i32> %4 to <4 x i8>
+ store <4 x i8> %5, <4 x i8>* undef, align 4
+ ret void
+}
+
+define void @avg_v8i8_const(<8 x i8>* %a) {
+; SSE2-LABEL: avg_v8i8_const
+; SSE2: # BB#0:
+; SSE2-NEXT: movq (%rdi), %xmm0 # xmm0 = mem[0],zero
+; SSE2-NEXT: pavgb {{.*}}, %xmm0
+; SSE2-NEXT: movq %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v8i8_const
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq (%rdi), %xmm0
+; AVX2-NEXT: vpavgb {{.*}}, %xmm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+ %1 = load <8 x i8>, <8 x i8>* %a
+ %2 = zext <8 x i8> %1 to <8 x i32>
+ %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+ %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %5 = trunc <8 x i32> %4 to <8 x i8>
+ store <8 x i8> %5, <8 x i8>* undef, align 4
+ ret void
+}
+
+define void @avg_v16i8_const(<16 x i8>* %a) {
+; SSE2-LABEL: avg_v16i8_const
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: pavgb {{.*}}, %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v16i8_const
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpavgb {{.*}}, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+ %1 = load <16 x i8>, <16 x i8>* %a
+ %2 = zext <16 x i8> %1 to <16 x i32>
+ %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+ %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %5 = trunc <16 x i32> %4 to <16 x i8>
+ store <16 x i8> %5, <16 x i8>* undef, align 4
+ ret void
+}
+
+define void @avg_v32i8_const(<32 x i8>* %a) {
+; AVX2-LABEL: avg_v32i8_const
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpavgb {{.*}}, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+;
+ %1 = load <32 x i8>, <32 x i8>* %a
+ %2 = zext <32 x i8> %1 to <32 x i32>
+ %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+ %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %5 = trunc <32 x i32> %4 to <32 x i8>
+ store <32 x i8> %5, <32 x i8>* undef, align 4
+ ret void
+}
+
+define void @avg_v64i8_const(<64 x i8>* %a) {
+; AVX512BW-LABEL: avg_v64i8_const
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
+; AVX512BW-NEXT: vpavgb {{.*}}, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT: retq
+;
+ %1 = load <64 x i8>, <64 x i8>* %a
+ %2 = zext <64 x i8> %1 to <64 x i32>
+ %3 = add nuw nsw <64 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+ %4 = lshr <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %5 = trunc <64 x i32> %4 to <64 x i8>
+ store <64 x i8> %5, <64 x i8>* undef, align 4
+ ret void
+}
+
+define void @avg_v4i16_const(<4 x i16>* %a) {
+; SSE2-LABEL: avg_v4i16_const
+; SSE2: # BB#0:
+; SSE2-NEXT: movq (%rdi), %xmm0
+; SSE2-NEXT: pavgw {{.*}}, %xmm0
+; SSE2-NEXT: movq %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v4i16_const
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq (%rdi), %xmm0
+; AVX2-NEXT: vpavgw {{.*}}, %xmm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+ %1 = load <4 x i16>, <4 x i16>* %a
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
+ %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
+ %5 = trunc <4 x i32> %4 to <4 x i16>
+ store <4 x i16> %5, <4 x i16>* undef, align 4
+ ret void
+}
+
+define void @avg_v8i16_const(<8 x i16>* %a) {
+; SSE2-LABEL: avg_v8i16_const
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: pavgw {{.*}}, %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v8i16_const
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpavgw {{.*}}, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+ %1 = load <8 x i16>, <8 x i16>* %a
+ %2 = zext <8 x i16> %1 to <8 x i32>
+ %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+ %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %5 = trunc <8 x i32> %4 to <8 x i16>
+ store <8 x i16> %5, <8 x i16>* undef, align 4
+ ret void
+}
+
+define void @avg_v16i16_const(<16 x i16>* %a) {
+; AVX2-LABEL: avg_v16i16_const
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpavgw {{.*}}, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+;
+ %1 = load <16 x i16>, <16 x i16>* %a
+ %2 = zext <16 x i16> %1 to <16 x i32>
+ %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+ %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %5 = trunc <16 x i32> %4 to <16 x i16>
+ store <16 x i16> %5, <16 x i16>* undef, align 4
+ ret void
+}
+
+define void @avg_v32i16_const(<32 x i16>* %a) {
+; AVX512BW-LABEL: avg_v32i16_const
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
+; AVX512BW-NEXT: vpavgw {{.*}}, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
+;
+ %1 = load <32 x i16>, <32 x i16>* %a
+ %2 = zext <32 x i16> %1 to <32 x i32>
+ %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+ %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %5 = trunc <32 x i32> %4 to <32 x i16>
+ store <32 x i16> %5, <32 x i16>* undef, align 4
+ ret void
+}

