Diffstat (limited to 'llvm/test/CodeGen/X86/stack-folding-int-avx512.ll')
-rw-r--r-- | llvm/test/CodeGen/X86/stack-folding-int-avx512.ll | 4842
1 file changed, 4036 insertions(+), 806 deletions(-)
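The new CHECK blocks in the diff below are machine-generated rather than hand-written. They can be reproduced by running the script named in the NOTE line against a matching llc build; a minimal sketch of the invocation follows (the build path passed to --llc-binary is an assumption about the local setup):

  # Regenerate the FileCheck assertions for this test; adjust the llc path to the local build.
  python3 llvm/utils/update_llc_test_checks.py --llc-binary=build/bin/llc \
      llvm/test/CodeGen/X86/stack-folding-int-avx512.ll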
diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll index eed9b36208c..ce6b1b5fb9c 100644 --- a/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vbmi,+avx512cd,+avx512vpopcntdq,+avx512vnni < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -9,16 +10,45 @@ target triple = "x86_64-unknown-unknown" ; relevant registers and check that the reload is correctly folded into the instruction. define <16 x i32> @stack_fold_valignd(<16 x i32> %a, <16 x i32> %b) { - ;CHECK-LABEL: stack_fold_valignd - ;CHECK: valignd $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_valignd: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0] +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16> ret <16 x i32> %2 } define <16 x i32> @stack_fold_valignd_mask(<16 x i32> %a, <16 x i32> %b, <16 x i32>* %passthru, i16 %mask) { - ;CHECK-LABEL: stack_fold_valignd_mask - ;CHECK: valignd $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_valignd_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: # zmm1 {%k1} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0] +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <16 x 
i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16> %3 = bitcast i16 %mask to <16 x i1> @@ -28,8 +58,22 @@ define <16 x i32> @stack_fold_valignd_mask(<16 x i32> %a, <16 x i32> %b, <16 x i } define <16 x i32> @stack_fold_valignd_maskz(<16 x i32> %a, <16 x i32> %b, i16 %mask) { - ;CHECK-LABEL: stack_fold_valignd_maskz - ;CHECK: valignd $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_valignd_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0] +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16> %3 = bitcast i16 %mask to <16 x i1> @@ -38,16 +82,45 @@ define <16 x i32> @stack_fold_valignd_maskz(<16 x i32> %a, <16 x i32> %b, i16 %m } define <8 x i64> @stack_fold_valignq(<8 x i64> %a, <8 x i64> %b) { - ;CHECK-LABEL: stack_fold_valignq - ;CHECK: valignq $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_valignq: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 = mem[1,2,3,4,5,6,7],zmm0[0] +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> ret <8 x i64> %2 } define <8 x i64> @stack_fold_valignq_mask(<8 x i64> %a, <8 x i64> %b, <8 x i64>* %passthru, i8 %mask) { - ;CHECK-LABEL: stack_fold_valignq_mask - ;CHECK: valignq $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_valignq_mask: +; CHECK: # 
%bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: # zmm1 {%k1} = mem[1,2,3,4,5,6,7],zmm0[0] +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> %3 = bitcast i8 %mask to <8 x i1> @@ -57,8 +130,22 @@ define <8 x i64> @stack_fold_valignq_mask(<8 x i64> %a, <8 x i64> %b, <8 x i64>* } define <8 x i64> @stack_fold_valignq_maskz(<8 x i64> %a, <8 x i64> %b, i8 %mask) { - ;CHECK-LABEL: stack_fold_valignq_maskz - ;CHECK: valignq $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_valignq_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7],zmm0[0] +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> %3 = bitcast i8 %mask to <8 x i1> @@ -67,8 +154,14 @@ define <8 x i64> @stack_fold_valignq_maskz(<8 x i64> %a, <8 x i64> %b, i8 %mask) } define <64 x i8> @stack_fold_pavgb(<64 x i8> %a0, <64 x i8> %a1) { - ;CHECK-LABEL: stack_fold_pavgb - ;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pavgb: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1) ret <64 x i8> %2 @@ -76,16 +169,31 @@ define <64 x i8> @stack_fold_pavgb(<64 x i8> %a0, <64 x i8> %a1) { declare <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8>, <64 x i8>) define <64 x i8> @stack_fold_pavgb_commuted(<64 x i8> %a0, <64 x i8> %a1) { - ;CHECK-LABEL: stack_fold_pavgb_commuted: - ;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pavgb_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a1, <64 x i8> %a0) ret <64 x i8> %2 } define <64 x i8> @stack_fold_pavgb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) { - ;CHECK-LABEL: stack_fold_pavgb_mask - ;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pavgb_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rsi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1) %3 = bitcast i64 %mask to <64 x i1> @@ -96,8 +204,17 @@ define <64 x i8> @stack_fold_pavgb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* } define <64 x i8> @stack_fold_pavgb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) { - ;CHECK-LABEL: stack_fold_pavgb_mask_commuted: - ;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pavgb_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rsi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a1, <64 x i8> %a0) %3 = bitcast i64 %mask to <64 x i1> @@ -108,8 +225,15 @@ define <64 x i8> @stack_fold_pavgb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, < } define <64 x i8> @stack_fold_pavgb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { - ;CHECK-LABEL: stack_fold_pavgb_maskz - ;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pavgb_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1) %3 = bitcast i64 %mask to <64 x i1> @@ -118,8 +242,15 @@ define <64 x i8> @stack_fold_pavgb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask } define <64 x i8> @stack_fold_pavgb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { - ;CHECK-LABEL: stack_fold_pavgb_maskz_commuted: - ;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pavgb_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a1, <64 x i8> %a0) %3 = bitcast i64 %mask to <64 x i1> @@ -128,8 +259,14 @@ define <64 x i8> @stack_fold_pavgb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, } define <32 x i16> @stack_fold_pavgw(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_pavgw - ;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pavgw: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1) ret <32 x i16> %2 @@ -137,16 +274,31 @@ define <32 x i16> @stack_fold_pavgw(<32 x i16> %a0, <32 x i16> %a1) { declare <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16>, <32 x i16>) define <32 x i16> @stack_fold_pavgw_commuted(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_pavgw_commuted: - ;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pavgw_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a1, <32 x i16> %a0) ret <32 x i16> %2 } define <32 x i16> @stack_fold_pavgw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { - ;CHECK-LABEL: stack_fold_pavgw_mask - ;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pavgw_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1) %3 = bitcast i32 %mask to <32 x i1> @@ -157,8 +309,17 @@ define <32 x i16> @stack_fold_pavgw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i } define <32 x i16> @stack_fold_pavgw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { - ;CHECK-LABEL: stack_fold_pavgw_mask_commuted: - ;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pavgw_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a1, <32 x i16> %a0) %3 = bitcast i32 %mask to <32 x i1> @@ -169,8 +330,15 @@ define <32 x i16> @stack_fold_pavgw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1 } define <32 x i16> @stack_fold_pavgw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { - ;CHECK-LABEL: stack_fold_pavgw_maskz - ;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pavgw_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1) %3 = bitcast i32 %mask to <32 x i1> @@ -179,8 +347,15 @@ define <32 x i16> @stack_fold_pavgw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %m } define <32 x i16> @stack_fold_pavgw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { - ;CHECK-LABEL: stack_fold_pavgw_maskz_commuted: - ;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pavgw_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a1, <32 x i16> %a0) %3 = bitcast i32 %mask to <32 x i1> @@ -189,8 +364,16 @@ define <32 x i16> @stack_fold_pavgw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a } define <4 x i32> @stack_fold_extracti32x4(<16 x i16> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_extracti32x4 - ;CHECK: vextracti32x4 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill +; CHECK-LABEL: stack_fold_extracti32x4: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; CHECK-NEXT: vextracti32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; 
CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq ; zext forces execution domain %1 = zext <16 x i16> %a0 to <16 x i32> %2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <4 x i32> <i32 12, i32 13, i32 14, i32 15> @@ -199,8 +382,16 @@ define <4 x i32> @stack_fold_extracti32x4(<16 x i16> %a0, <16 x i32> %a1) { } define <2 x i64> @stack_fold_extracti64x2(<8 x i32> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_extracti64x2 - ;CHECK: vextracti32x4 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill +; CHECK-LABEL: stack_fold_extracti64x2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; CHECK-NEXT: vextracti32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq ; zext forces execution domain %1 = zext <8 x i32> %a0 to <8 x i64> %2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <2 x i32> <i32 6, i32 7> @@ -209,8 +400,15 @@ define <2 x i64> @stack_fold_extracti64x2(<8 x i32> %a0, <8 x i64> %a1) { } define <8 x i32> @stack_fold_extracti32x8(<16 x i16> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_extracti32x8 - ;CHECK: vextracti64x4 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill +; CHECK-LABEL: stack_fold_extracti32x8: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; CHECK-NEXT: vextracti64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: retq ; zext forces execution domain %1 = zext <16 x i16> %a0 to <16 x i32> %2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> @@ -219,8 +417,15 @@ define <8 x i32> @stack_fold_extracti32x8(<16 x i16> %a0, <16 x i32> %a1) { } define <4 x i64> @stack_fold_extracti64x4(<8 x i32> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_extracti64x4 - ;CHECK: vextracti64x4 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill +; CHECK-LABEL: stack_fold_extracti64x4: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; CHECK-NEXT: vextracti64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: retq ; zext forces execution domain %1 = zext <8 x i32> %a0 to <8 x i64> %2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7> @@ -229,8 +434,17 @@ define <4 x i64> @stack_fold_extracti64x4(<8 x i32> %a0, <8 x i64> %a1) { } define <16 x i32> @stack_fold_inserti32x8(<8 x i32> %a0, <8 x i32> %a1) { - ;CHECK-LABEL: stack_fold_inserti32x8 - ;CHECK: vinserti64x4 $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload +; CHECK-LABEL: stack_fold_inserti32x8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> ; add forces execution domain @@ -239,8 +453,17 @@ define <16 x i32> @stack_fold_inserti32x8(<8 x i32> %a0, <8 x i32> %a1) { } define <8 x i64> @stack_fold_inserti64x4(<4 x i64> %a0, <4 x i64> %a1) { - ;CHECK-LABEL: stack_fold_inserti64x4 - ;CHECK: vinserti64x4 $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload +; CHECK-LABEL: stack_fold_inserti64x4: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; add forces execution domain @@ -249,8 +472,14 @@ define <8 x i64> @stack_fold_inserti64x4(<4 x i64> %a0, <4 x i64> %a1) { } define <64 x i8> @stack_fold_pabsb(<64 x i8> %a0) { - ;CHECK-LABEL: stack_fold_pabsb - ;CHECK: vpabsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pabsb: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <64 x i8> %a0, zeroinitializer %3 = sub <64 x i8> zeroinitializer, %a0 @@ -259,8 +488,21 @@ define <64 x i8> @stack_fold_pabsb(<64 x i8> %a0) { } define <64 x i8> @stack_fold_pabsb_mask(<64 x i8> %passthru, <64 x i8> %a0, i64 %mask) { - ;CHECK-LABEL: stack_fold_pabsb_mask - ;CHECK: vpabsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pabsb_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: vmovups 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <64 x i8> %a0, zeroinitializer %3 = sub <64 x i8> zeroinitializer, %a0 @@ -271,8 +513,15 @@ define <64 x i8> @stack_fold_pabsb_mask(<64 x i8> %passthru, <64 x i8> %a0, i64 } define <64 x i8> @stack_fold_pabsb_maskz(<64 x i8> %a0, i64 %mask) { - ;CHECK-LABEL: stack_fold_pabsb_maskz - ;CHECK: vpabsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pabsb_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <64 x i8> %a0, zeroinitializer %3 = sub <64 x i8> zeroinitializer, %a0 @@ -283,8 +532,14 @@ define <64 x i8> @stack_fold_pabsb_maskz(<64 x i8> %a0, i64 %mask) { } define <16 x i32> @stack_fold_pabsd(<16 x i32> %a0) { - ;CHECK-LABEL: stack_fold_pabsd - ;CHECK: vpabsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pabsd: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <16 x i32> %a0, zeroinitializer %3 = sub <16 x i32> zeroinitializer, %a0 @@ -293,8 +548,21 @@ define <16 x i32> @stack_fold_pabsd(<16 x i32> %a0) { } define <16 x i32> @stack_fold_pabsd_mask(<16 x i32> %passthru, <16 x i32> %a0, i16 %mask) { - ;CHECK-LABEL: stack_fold_pabsd_mask - ;CHECK: vpabsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pabsd_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; 
CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <16 x i32> %a0, zeroinitializer %3 = sub <16 x i32> zeroinitializer, %a0 @@ -305,8 +573,15 @@ define <16 x i32> @stack_fold_pabsd_mask(<16 x i32> %passthru, <16 x i32> %a0, i } define <16 x i32> @stack_fold_pabsd_maskz(<16 x i32> %a0, i16 %mask) { - ;CHECK-LABEL: stack_fold_pabsd_maskz - ;CHECK: vpabsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pabsd_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <16 x i32> %a0, zeroinitializer %3 = sub <16 x i32> zeroinitializer, %a0 @@ -317,8 +592,14 @@ define <16 x i32> @stack_fold_pabsd_maskz(<16 x i32> %a0, i16 %mask) { } define <8 x i64> @stack_fold_pabsq(<8 x i64> %a0) { - ;CHECK-LABEL: stack_fold_pabsq - ;CHECK: vpabsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pabsq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <8 x i64> %a0, zeroinitializer %3 = sub <8 x i64> zeroinitializer, %a0 @@ -327,8 +608,21 @@ define <8 x i64> @stack_fold_pabsq(<8 x i64> %a0) { } define <8 x i64> @stack_fold_pabsq_mask(<8 x i64> %passthru, <8 x i64> %a0, i8 %mask) { - ;CHECK-LABEL: stack_fold_pabsq_mask - ;CHECK: vpabsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pabsq_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload 
+; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <8 x i64> %a0, zeroinitializer %3 = sub <8 x i64> zeroinitializer, %a0 @@ -339,8 +633,15 @@ define <8 x i64> @stack_fold_pabsq_mask(<8 x i64> %passthru, <8 x i64> %a0, i8 % } define <8 x i64> @stack_fold_pabsq_maskz(<8 x i64> %a0, i8 %mask) { - ;CHECK-LABEL: stack_fold_pabsq_maskz - ;CHECK: vpabsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pabsq_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <8 x i64> %a0, zeroinitializer %3 = sub <8 x i64> zeroinitializer, %a0 @@ -351,8 +652,14 @@ define <8 x i64> @stack_fold_pabsq_maskz(<8 x i64> %a0, i8 %mask) { } define <32 x i16> @stack_fold_pabsw(<32 x i16> %a0) { - ;CHECK-LABEL: stack_fold_pabsw - ;CHECK: vpabsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pabsw: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <32 x i16> %a0, zeroinitializer %3 = sub <32 x i16> zeroinitializer, %a0 @@ -361,8 +668,21 @@ define <32 x i16> @stack_fold_pabsw(<32 x i16> %a0) { } define <32 x i16> @stack_fold_pabsw_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) { - ;CHECK-LABEL: stack_fold_pabsw_mask - ;CHECK: vpabsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pabsw_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <32 x i16> %a0, zeroinitializer %3 = sub <32 x i16> zeroinitializer, %a0 @@ -373,8 +693,15 @@ define <32 x i16> @stack_fold_pabsw_mask(<32 x i16> %passthru, <32 x i16> %a0, i } define <32 x i16> @stack_fold_pabsw_maskz(<32 x i16> %a0, i32 %mask) { - ;CHECK-LABEL: stack_fold_pabsw_maskz - ;CHECK: vpabsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pabsw_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <32 x i16> %a0, zeroinitializer %3 = sub <32 x i16> zeroinitializer, %a0 @@ -385,8 +712,14 @@ define <32 x i16> @stack_fold_pabsw_maskz(<32 x i16> %a0, i32 %mask) { } define <32 x i16> @stack_fold_packssdw(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_packssdw - ;CHECK: vpackssdw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_packssdw: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a0, <16 x i32> %a1) ret <32 x i16> %2 @@ -394,8 +727,14 @@ define <32 x i16> @stack_fold_packssdw(<16 x i32> %a0, <16 x i32> %a1) { declare <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32>, <16 x i32>) nounwind readnone define <64 x i8> @stack_fold_packsswb(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_packsswb - ;CHECK: vpacksswb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_packsswb: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> 
@llvm.x86.avx512.packsswb.512(<32 x i16> %a0, <32 x i16> %a1) ret <64 x i8> %2 @@ -403,8 +742,14 @@ define <64 x i8> @stack_fold_packsswb(<32 x i16> %a0, <32 x i16> %a1) { declare <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16>, <32 x i16>) nounwind readnone define <32 x i16> @stack_fold_packusdw(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_packusdw - ;CHECK: vpackusdw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_packusdw: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a0, <16 x i32> %a1) ret <32 x i16> %2 @@ -412,8 +757,17 @@ define <32 x i16> @stack_fold_packusdw(<16 x i32> %a0, <16 x i32> %a1) { declare <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32>, <16 x i32>) nounwind readnone define <32 x i16> @stack_fold_packusdw_mask(<32 x i16>* %passthru, <16 x i32> %a0, <16 x i32> %a1, i32 %mask) { - ;CHECK-LABEL: stack_fold_packusdw_mask - ;CHECK: vpackusdw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_packusdw_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = load <32 x i16>, <32 x i16>* %passthru %3 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a0, <16 x i32> %a1) @@ -423,8 +777,15 @@ define <32 x i16> @stack_fold_packusdw_mask(<32 x i16>* %passthru, <16 x i32> %a } define <32 x i16> @stack_fold_packusdw_maskz(<16 x i32> %a0, <16 x i32> %a1, i32 %mask) { - ;CHECK-LABEL: stack_fold_packusdw_maskz - ;CHECK: vpackusdw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_packusdw_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a0, <16 x i32> %a1) %3 = bitcast i32 %mask to <32 x i1> @@ -433,8 +794,14 @@ define <32 x i16> @stack_fold_packusdw_maskz(<16 x i32> %a0, <16 x i32> %a1, i32 } define <64 x i8> @stack_fold_packuswb(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_packuswb - ;CHECK: vpackuswb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_packuswb: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpackuswb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a0, <32 x i16> %a1) ret <64 x i8> %2 @@ -442,24 +809,45 @@ define <64 x i8> @stack_fold_packuswb(<32 x i16> %a0, <32 x i16> %a1) { declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>) nounwind readnone define <64 x i8> @stack_fold_paddb(<64 x i8> %a0, <64 x i8> %a1) { - ;CHECK-LABEL: stack_fold_paddb - ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddb: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = add <64 x i8> %a0, %a1 ret <64 x i8> %2 } define <64 x i8> @stack_fold_paddb_commuted(<64 x i8> %a0, <64 x i8> %a1) { - ;CHECK-LABEL: stack_fold_paddb_commuted: - ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddb_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = add <64 x i8> %a1, %a0 ret <64 x i8> %2 } define <64 x i8> @stack_fold_paddb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) { - ;CHECK-LABEL: stack_fold_paddb_mask - ;CHECK: vpaddb 
{{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddb_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rsi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = add <64 x i8> %a0, %a1 %3 = bitcast i64 %mask to <64 x i1> @@ -470,8 +858,17 @@ define <64 x i8> @stack_fold_paddb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* } define <64 x i8> @stack_fold_paddb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) { - ;CHECK-LABEL: stack_fold_paddb_mask_commuted: - ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddb_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rsi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = add <64 x i8> %a1, %a0 %3 = bitcast i64 %mask to <64 x i1> @@ -482,8 +879,15 @@ define <64 x i8> @stack_fold_paddb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, < } define <64 x i8> @stack_fold_paddb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { - ;CHECK-LABEL: stack_fold_paddb_maskz - ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddb_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = add <64 x i8> %a0, %a1 %3 = bitcast i64 %mask to <64 x i1> @@ -492,8 +896,15 @@ define <64 x i8> @stack_fold_paddb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask } define <64 x i8> @stack_fold_paddb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { - ;CHECK-LABEL: stack_fold_paddb_maskz_commuted: - ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; 
CHECK-LABEL: stack_fold_paddb_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = add <64 x i8> %a1, %a0 %3 = bitcast i64 %mask to <64 x i1> @@ -502,24 +913,45 @@ define <64 x i8> @stack_fold_paddb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, } define <16 x i32> @stack_fold_paddd(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_paddd - ;CHECK: vpaddd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddd: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = add <16 x i32> %a0, %a1 ret <16 x i32> %2 } define <16 x i32> @stack_fold_paddd_commuted(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_paddd_commuted: - ;CHECK: vpaddd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddd_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = add <16 x i32> %a1, %a0 ret <16 x i32> %2 } define <16 x i32> @stack_fold_paddd_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { - ;CHECK-LABEL: stack_fold_paddd_mask - ;CHECK: vpaddd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddd_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = add <16 x i32> %a0, %a1 %3 = bitcast i16 %mask to <16 x i1> @@ -530,8 +962,17 @@ define <16 x i32> @stack_fold_paddd_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i } define <16 x i32> @stack_fold_paddd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { - ;CHECK-LABEL: stack_fold_paddd_mask_commuted: - ;CHECK: vpaddd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddd_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = add <16 x i32> %a1, %a0 %3 = bitcast i16 %mask to <16 x i1> @@ -542,8 +983,15 @@ define <16 x i32> @stack_fold_paddd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1 } define <16 x i32> @stack_fold_paddd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { - ;CHECK-LABEL: stack_fold_paddd_maskz - ;CHECK: vpaddd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddd_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = add <16 x i32> %a0, %a1 %3 = bitcast i16 %mask to <16 x i1> @@ -552,8 +1000,15 @@ define <16 x i32> @stack_fold_paddd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %m } define <16 x i32> @stack_fold_paddd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { - ;CHECK-LABEL: stack_fold_paddd_maskz_commuted: - ;CHECK: vpaddd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddd_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = add <16 x i32> %a1, %a0 %3 = bitcast i16 %mask to <16 x i1> @@ -562,24 +1017,45 @@ define <16 x i32> @stack_fold_paddd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a } define <8 x i64> @stack_fold_paddq(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_paddq - ;CHECK: vpaddq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = add <8 x i64> %a0, %a1 ret <8 x i64> %2 } define <8 x i64> @stack_fold_paddq_commuted(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_paddq_commuted: - ;CHECK: vpaddq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddq_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = add <8 x i64> %a1, %a0 ret <8 x i64> %2 } define <8 x i64> @stack_fold_paddq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { - ;CHECK-LABEL: stack_fold_paddq_mask - ;CHECK: vpaddq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddq_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = add <8 x i64> %a0, %a1 %3 = bitcast i8 %mask to <8 x i1> @@ -590,8 +1066,17 @@ define <8 x i64> @stack_fold_paddq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* } define <8 x i64> @stack_fold_paddq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { - ;CHECK-LABEL: stack_fold_paddq_mask_commuted: - ;CHECK: vpaddq 
{{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddq_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = add <8 x i64> %a1, %a0 %3 = bitcast i8 %mask to <8 x i1> @@ -602,8 +1087,15 @@ define <8 x i64> @stack_fold_paddq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, < } define <8 x i64> @stack_fold_paddq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { - ;CHECK-LABEL: stack_fold_paddq_maskz - ;CHECK: vpaddq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddq_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = add <8 x i64> %a0, %a1 %3 = bitcast i8 %mask to <8 x i1> @@ -612,8 +1104,15 @@ define <8 x i64> @stack_fold_paddq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) } define <8 x i64> @stack_fold_paddq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { - ;CHECK-LABEL: stack_fold_paddq_maskz_commuted: - ;CHECK: vpaddq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddq_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = add <8 x i64> %a1, %a0 %3 = bitcast i8 %mask to <8 x i1> @@ -622,24 +1121,45 @@ define <8 x i64> @stack_fold_paddq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, } define <64 x i8> @stack_fold_paddsb(<64 x i8> %a0, <64 x i8> %a1) { - ;CHECK-LABEL: stack_fold_paddsb - ;CHECK: vpaddsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddsb: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: 
#APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) ret <64 x i8> %2 } define <64 x i8> @stack_fold_paddsb_commuted(<64 x i8> %a0, <64 x i8> %a1) { - ;CHECK-LABEL: stack_fold_paddsb_commuted: - ;CHECK: vpaddsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddsb_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0) ret <64 x i8> %2 } define <64 x i8> @stack_fold_paddsb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) { - ;CHECK-LABEL: stack_fold_paddsb_mask - ;CHECK: vpaddsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddsb_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rsi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) %3 = bitcast i64 %mask to <64 x i1> @@ -650,8 +1170,17 @@ define <64 x i8> @stack_fold_paddsb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> } define <64 x i8> @stack_fold_paddsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) { - ;CHECK-LABEL: stack_fold_paddsb_mask_commuted: - ;CHECK: vpaddsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddsb_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rsi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0) %3 = bitcast i64 %mask to <64 x i1> @@ -662,8 +1191,15 @@ define <64 x i8> @stack_fold_paddsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, } define <64 x i8> @stack_fold_paddsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { - ;CHECK-LABEL: stack_fold_paddsb_maskz - ;CHECK: vpaddsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddsb_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) %3 = bitcast i64 %mask to <64 x i1> @@ -672,8 +1208,15 @@ define <64 x i8> @stack_fold_paddsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mas } define <64 x i8> @stack_fold_paddsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { - ;CHECK-LABEL: stack_fold_paddsb_maskz_commuted: - ;CHECK: vpaddsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddsb_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0) %3 = bitcast i64 %mask to <64 x i1> @@ -682,24 +1225,45 @@ define <64 x i8> @stack_fold_paddsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, } define <32 x i16> @stack_fold_paddsw(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_paddsw - ;CHECK: vpaddsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddsw: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) ret <32 x i16> %2 } define <32 x i16> @stack_fold_paddsw_commuted(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_paddsw_commuted: - ;CHECK: vpaddsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddsw_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0) ret <32 x i16> %2 } define <32 x i16> @stack_fold_paddsw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { - ;CHECK-LABEL: stack_fold_paddsw_mask - ;CHECK: vpaddsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddsw_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) %3 = bitcast i32 %mask to <32 x i1> @@ -710,8 +1274,17 @@ define <32 x i16> @stack_fold_paddsw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x } define <32 x i16> @stack_fold_paddsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { - ;CHECK-LABEL: stack_fold_paddsw_mask_commuted: - ;CHECK: vpaddsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddsw_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0) %3 = bitcast i32 %mask to <32 x i1> @@ -722,8 +1295,15 @@ define <32 x i16> @stack_fold_paddsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a } define <32 x i16> @stack_fold_paddsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { - ;CHECK-LABEL: stack_fold_paddsw_maskz - ;CHECK: vpaddsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddsw_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) %3 = bitcast i32 %mask to <32 x i1> @@ -732,8 +1312,15 @@ define <32 x i16> @stack_fold_paddsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 % } define <32 x i16> @stack_fold_paddsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { - ;CHECK-LABEL: stack_fold_paddsw_maskz_commuted: - ;CHECK: vpaddsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddsw_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0) %3 = bitcast i32 %mask to <32 x i1> @@ -742,24 +1329,45 @@ define <32 x i16> @stack_fold_paddsw_maskz_commuted(<32 x i16> %a0, <32 x i16> % } define <64 x i8> @stack_fold_paddusb(<64 x i8> %a0, <64 x i8> %a1) { - ;CHECK-LABEL: stack_fold_paddusb - ;CHECK: vpaddusb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddusb: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) ret <64 x i8> %2 } define <64 x i8> @stack_fold_paddusb_commuted(<64 x i8> %a0, <64 x i8> %a1) { - ;CHECK-LABEL: stack_fold_paddusb_commuted: - ;CHECK: vpaddusb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddusb_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0) ret <64 x i8> %2 } define <64 x i8> @stack_fold_paddusb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) { - ;CHECK-LABEL: stack_fold_paddusb_mask - ;CHECK: vpaddusb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddusb_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rsi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) %3 = bitcast i64 %mask to <64 x i1> @@ -770,8 +1378,17 @@ define <64 x i8> @stack_fold_paddusb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8 } define <64 x i8> @stack_fold_paddusb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) { - ;CHECK-LABEL: stack_fold_paddusb_mask_commuted: - ;CHECK: vpaddusb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddusb_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rsi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0) %3 = bitcast i64 %mask to <64 x i1> @@ -782,8 +1399,15 @@ define <64 x i8> @stack_fold_paddusb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, } define <64 x i8> @stack_fold_paddusb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { - ;CHECK-LABEL: stack_fold_paddusb_maskz - ;CHECK: vpaddusb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddusb_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) %3 = bitcast i64 %mask to <64 x i1> @@ -792,8 +1416,15 @@ define <64 x i8> @stack_fold_paddusb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %ma } define <64 x i8> @stack_fold_paddusb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { - ;CHECK-LABEL: stack_fold_paddusb_maskz_commuted: - ;CHECK: vpaddusb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddusb_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0) %3 = bitcast i64 %mask to <64 x i1> @@ -802,24 +1433,45 @@ define <64 x i8> @stack_fold_paddusb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1 } define <32 x i16> @stack_fold_paddusw(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_paddusw - ;CHECK: vpaddusw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddusw: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) ret <32 x i16> %2 } define <32 x i16> @stack_fold_paddusw_commuted(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_paddusw_commuted: - ;CHECK: vpaddusw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddusw_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0) ret <32 x i16> %2 } define <32 x i16> @stack_fold_paddusw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { - ;CHECK-LABEL: stack_fold_paddusw_mask - ;CHECK: vpaddusw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddusw_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) %3 = bitcast i32 %mask to <32 x i1> @@ -830,8 +1482,17 @@ define <32 x i16> @stack_fold_paddusw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x } define <32 x i16> @stack_fold_paddusw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { - ;CHECK-LABEL: stack_fold_paddusw_mask_commuted: - ;CHECK: vpaddusw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddusw_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0) %3 = bitcast i32 %mask to <32 x i1> @@ -842,8 +1503,15 @@ define <32 x i16> @stack_fold_paddusw_mask_commuted(<32 x i16> %a0, <32 x i16> % } define <32 x i16> @stack_fold_paddusw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { - ;CHECK-LABEL: stack_fold_paddusw_maskz - ;CHECK: vpaddusw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddusw_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) %3 = bitcast i32 %mask to <32 x i1> @@ -852,8 +1520,15 @@ define <32 x i16> @stack_fold_paddusw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 } define <32 x i16> @stack_fold_paddusw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { - ;CHECK-LABEL: stack_fold_paddusw_maskz_commuted: - ;CHECK: vpaddusw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddusw_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0) %3 = bitcast i32 %mask to <32 x i1> @@ -862,24 +1537,45 @@ define <32 x i16> @stack_fold_paddusw_maskz_commuted(<32 x i16> %a0, <32 x i16> } define <32 x i16> @stack_fold_paddw(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_paddw - ;CHECK: vpaddw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddw: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = add <32 x i16> %a0, %a1 ret <32 x i16> %2 } define <32 x i16> @stack_fold_paddw_commuted(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_paddw_commuted: - ;CHECK: vpaddw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddw_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = add <32 x i16> %a1, %a0 ret <32 x i16> %2 } define <32 x i16> @stack_fold_paddw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { - ;CHECK-LABEL: stack_fold_paddw_mask - ;CHECK: vpaddw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddw_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = add <32 x i16> %a0, %a1 %3 = bitcast i32 %mask to <32 x i1> @@ -890,8 +1586,17 @@ define <32 x i16> @stack_fold_paddw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i } define <32 x i16> @stack_fold_paddw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { - ;CHECK-LABEL: stack_fold_paddw_mask_commuted: - ;CHECK: vpaddw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddw_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = add <32 x i16> %a1, %a0 %3 = bitcast i32 %mask to <32 x i1> @@ -902,8 +1607,15 @@ define <32 x i16> 
@stack_fold_paddw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1 } define <32 x i16> @stack_fold_paddw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { - ;CHECK-LABEL: stack_fold_paddw_maskz - ;CHECK: vpaddw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddw_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = add <32 x i16> %a0, %a1 %3 = bitcast i32 %mask to <32 x i1> @@ -912,8 +1624,15 @@ define <32 x i16> @stack_fold_paddw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %m } define <32 x i16> @stack_fold_paddw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { - ;CHECK-LABEL: stack_fold_paddw_maskz_commuted: - ;CHECK: vpaddw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_paddw_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = add <32 x i16> %a1, %a0 %3 = bitcast i32 %mask to <32 x i1> @@ -922,16 +1641,45 @@ define <32 x i16> @stack_fold_paddw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a } define <64 x i8> @stack_fold_palignr(<64 x i8> %a0, <64 x i8> %a1) { - ;CHECK-LABEL: stack_fold_palignr - ;CHECK: vpalignr $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_palignr: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48] +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112> ret <64 x i8> %2 } define <64 x i8> @stack_fold_palignr_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %passthru, i64 %mask) { - ;CHECK-LABEL: stack_fold_palignr_mask - ;CHECK: vpalignr $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_palignr_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rsi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: # zmm1 {%k1} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48] +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112> %3 = bitcast i64 %mask to <64 x i1> @@ -941,8 +1689,22 @@ define <64 x i8> @stack_fold_palignr_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8 } define <64 x i8> @stack_fold_palignr_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { - ;CHECK-LABEL: stack_fold_palignr_maskz - ;CHECK: vpalignr $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_palignr_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: vmovups 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48] +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112> %3 = bitcast i64 %mask to <64 x i1> @@ -951,24 +1713,45 @@ define <64 x i8> @stack_fold_palignr_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %ma } define <16 x i32> @stack_fold_pandd(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_pandd - ;CHECK: vandps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pandd: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <16 x i32> %a0, %a1 ret <16 x i32> %2 } define <16 x i32> @stack_fold_pandd_commuted(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_pandd_commuted: - ;CHECK: vandps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pandd_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <16 x i32> %a1, %a0 ret <16 x i32> %2 
} define <16 x i32> @stack_fold_pandd_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { - ;CHECK-LABEL: stack_fold_pandd_mask - ;CHECK: vandps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pandd_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovaps %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovaps (%rdi), %zmm0 +; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <16 x i32> %a0, %a1 %3 = bitcast i16 %mask to <16 x i1> @@ -979,8 +1762,17 @@ define <16 x i32> @stack_fold_pandd_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i } define <16 x i32> @stack_fold_pandd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { - ;CHECK-LABEL: stack_fold_pandd_mask_commuted: - ;CHECK: vandps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pandd_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovaps %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovaps (%rdi), %zmm0 +; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <16 x i32> %a1, %a0 %3 = bitcast i16 %mask to <16 x i1> @@ -991,8 +1783,15 @@ define <16 x i32> @stack_fold_pandd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1 } define <16 x i32> @stack_fold_pandd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { - ;CHECK-LABEL: stack_fold_pandd_maskz - ;CHECK: vandps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pandd_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <16 x i32> %a0, %a1 %3 = bitcast i16 %mask to <16 x i1> @@ -1001,8 +1800,15 @@ define <16 x i32> @stack_fold_pandd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %m } define <16 x i32> @stack_fold_pandd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { - 
;CHECK-LABEL: stack_fold_pandd_maskz_commuted: - ;CHECK: vandps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pandd_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <16 x i32> %a1, %a0 %3 = bitcast i16 %mask to <16 x i1> @@ -1011,24 +1817,45 @@ define <16 x i32> @stack_fold_pandd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a } define <8 x i64> @stack_fold_pandq(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_pandq - ;CHECK: vandps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pandq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <8 x i64> %a0, %a1 ret <8 x i64> %2 } define <8 x i64> @stack_fold_pandq_commuted(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_pandq_commuted: - ;CHECK: vandps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pandq_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <8 x i64> %a1, %a0 ret <8 x i64> %2 } define <8 x i64> @stack_fold_pandq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { - ;CHECK-LABEL: stack_fold_pandq_mask - ;CHECK: vandpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pandq_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovapd %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovapd (%rdi), %zmm0 +; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <8 x i64> %a0, %a1 %3 = bitcast i8 %mask to <8 x i1> @@ -1039,8 +1866,17 @@ define <8 x i64> @stack_fold_pandq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* } define <8 x i64> @stack_fold_pandq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { - ;CHECK-LABEL: stack_fold_pandq_mask_commuted: - ;CHECK: vandpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pandq_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovapd %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovapd (%rdi), %zmm0 +; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <8 x i64> %a1, %a0 %3 = bitcast i8 %mask to <8 x i1> @@ -1051,8 +1887,15 @@ define <8 x i64> @stack_fold_pandq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, < } define <8 x i64> @stack_fold_pandq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { - ;CHECK-LABEL: stack_fold_pandq_maskz - ;CHECK: vandpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pandq_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <8 x i64> %a0, %a1 %3 = bitcast i8 %mask to <8 x i1> @@ -1061,8 +1904,15 @@ define <8 x i64> @stack_fold_pandq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) } define <8 x i64> @stack_fold_pandq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { - ;CHECK-LABEL: stack_fold_pandq_maskz_commuted: - ;CHECK: vandpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pandq_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <8 x i64> %a1, %a0 %3 = bitcast i8 %mask to <8 x i1> @@ -1071,8 +1921,14 @@ define <8 x i64> @stack_fold_pandq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, } define <16 x i32> @stack_fold_vpconflictd(<16 x i32> %a0) { - ;CHECK-LABEL: stack_fold_vpconflictd - ;CHECK: vpconflictd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_vpconflictd: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpconflictd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32> %a0) ret <16 x i32> %2 @@ -1080,8 +1936,14 @@ define <16 x i32> @stack_fold_vpconflictd(<16 x i32> %a0) { declare <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly define <8 x i64> @stack_fold_vpconflictq(<8 x i64> %a0) { - ;CHECK-LABEL: stack_fold_vpconflictq - ;CHECK: vpconflictq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_vpconflictq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpconflictq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <8 x i64> @llvm.x86.avx512.conflict.q.512(<8 x i64> %a0) ret <8 x i64> %2 @@ -1089,8 +1951,16 @@ define <8 x i64> @stack_fold_vpconflictq(<8 x i64> %a0) { declare <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64>, <8 x i64>, i8) nounwind readnone define i64 @stack_fold_pcmpeqb(<64 x i8> %a0, <64 x i8> %a1) { - ;CHECK-LABEL: stack_fold_pcmpeqb - ;CHECK: vpcmpeqb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pcmpeqb: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp eq <64 x i8> %a0, %a1 %3 = bitcast <64 x i1> %2 
to i64 @@ -1098,8 +1968,17 @@ define i64 @stack_fold_pcmpeqb(<64 x i8> %a0, <64 x i8> %a1) { } define i16 @stack_fold_pcmpeqd(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_pcmpeqd - ;CHECK: vpcmpeqd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pcmpeqd: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp eq <16 x i32> %a0, %a1 %3 = bitcast <16 x i1> %2 to i16 @@ -1107,8 +1986,17 @@ define i16 @stack_fold_pcmpeqd(<16 x i32> %a0, <16 x i32> %a1) { } define i8 @stack_fold_pcmpeqq(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_pcmpeqq - ;CHECK: vpcmpeqq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pcmpeqq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpcmpeqq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp eq <8 x i64> %a0, %a1 %3 = bitcast <8 x i1> %2 to i8 @@ -1116,8 +2004,16 @@ define i8 @stack_fold_pcmpeqq(<8 x i64> %a0, <8 x i64> %a1) { } define i32 @stack_fold_pcmpeqw(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_pcmpeqw - ;CHECK: vpcmpeqw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pcmpeqw: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp eq <32 x i16> %a0, %a1 %3 = bitcast <32 x i1> %2 to i32 @@ -1125,8 +2021,26 @@ define i32 @stack_fold_pcmpeqw(<32 x i16> %a0, <32 x i16> %a1) { } define <16 x i32> @stack_fold_pcmpeqd_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) { - ;CHECK-LABEL: stack_fold_pcmpeqd_mask - ;CHECK: vpcmpeqd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} 
{{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pcmpeqd_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $184, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 192 +; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vpblendmd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: addq $184, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; load and add are here to keep the operations below the side effecting block and to avoid folding the wrong load %2 = load <16 x i32>, <16 x i32>* %a2 @@ -1139,8 +2053,26 @@ define <16 x i32> @stack_fold_pcmpeqd_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x } define <16 x i32> @stack_fold_pcmpeqd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) { - ;CHECK-LABEL: stack_fold_pcmpeqd_mask_commuted - ;CHECK: vpcmpeqd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pcmpeqd_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $184, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 192 +; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vpblendmd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: addq $184, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; load and add are here to keep the operations below the side effecting block and to avoid folding the wrong load %2 = load <16 x i32>, <16 x i32>* %a2 @@ -1153,8 +2085,26 @@ define <16 x i32> @stack_fold_pcmpeqd_mask_commuted(<16 x i32> %a0, <16 x i32> % } define <16 x i32> @stack_fold_pcmpled_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) { 
- ;CHECK-LABEL: stack_fold_pcmpled_mask - ;CHECK: vpcmpled {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pcmpled_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $184, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 192 +; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpcmpled {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vpblendmd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: addq $184, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; load and add are here to keep the operations below the side effecting block and to avoid folding the wrong load %2 = load <16 x i32>, <16 x i32>* %a2 @@ -1167,8 +2117,25 @@ define <16 x i32> @stack_fold_pcmpled_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x } define i16 @stack_fold_pcmpleud(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { - ;CHECK-LABEL: stack_fold_pcmpleud - ;CHECK: vpcmpleud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pcmpleud: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: vpcmpleud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: andl %esi, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = load <16 x i32>, <16 x i32>* %a2 %3 = add <16 x i32> %a1, %2 @@ -1180,8 +2147,14 @@ define i16 @stack_fold_pcmpleud(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, } define <64 x i8> @stack_fold_permbvar(<64 x i8> %a0, <64 x i8> %a1) { - ;CHECK-LABEL: stack_fold_permbvar - ;CHECK: vpermb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_permbvar: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; 
CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpermb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0) ret <64 x i8> %2 @@ -1189,8 +2162,17 @@ define <64 x i8> @stack_fold_permbvar(<64 x i8> %a0, <64 x i8> %a1) { declare <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>) nounwind readonly define <64 x i8> @stack_fold_permbvar_mask(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) { - ;CHECK-LABEL: stack_fold_permbvar_mask - ;CHECK: vpermb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_permbvar_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rsi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT: vpermb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0) %3 = bitcast i64 %mask to <64 x i1> @@ -1201,8 +2183,15 @@ define <64 x i8> @stack_fold_permbvar_mask(<64 x i8>* %passthru, <64 x i8> %a0, } define <64 x i8> @stack_fold_permbvar_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { - ;CHECK-LABEL: stack_fold_permbvar_maskz - ;CHECK: vpermb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_permbvar_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpermb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0) %3 = bitcast i64 %mask to <64 x i1> @@ -1211,8 +2200,16 @@ define <64 x i8> @stack_fold_permbvar_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %m } define <16 x i32> @stack_fold_permd(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_permd - ;CHECK: vpermd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_permd: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpermd 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a1, <16 x i32> %a0) ; add forces execution domain @@ -1222,40 +2219,73 @@ define <16 x i32> @stack_fold_permd(<16 x i32> %a0, <16 x i32> %a1) { declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>) nounwind readonly define <64 x i8> @stack_fold_vpermi2b(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { - ;CHECK-LABEL: stack_fold_vpermi2b - ;CHECK: vpermi2b {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-LABEL: stack_fold_vpermi2b: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpermi2b {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x1, <64 x i8> %x0, <64 x i8> %x2) ret <64 x i8> %2 } define <16 x i32> @stack_fold_vpermi2d(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { - ;CHECK-LABEL: stack_fold_vpermi2d - ;CHECK: vpermi2d {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-LABEL: stack_fold_vpermi2d: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2) ret <16 x i32> %2 } define <8 x i64> @stack_fold_vpermi2q(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { - ;CHECK-LABEL: stack_fold_vpermi2q - ;CHECK: vpermi2q {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-LABEL: stack_fold_vpermi2q: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x 
i64> %x0, <8 x i64> %x2) ret <8 x i64> %2 } define <32 x i16> @stack_fold_vpermi2w(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { - ;CHECK-LABEL: stack_fold_vpermi2w - ;CHECK: vpermi2w {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-LABEL: stack_fold_vpermi2w: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpermi2w {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x1, <32 x i16> %x0, <32 x i16> %x2) ret <32 x i16> %2 } define <8 x i64> @stack_fold_permq(<8 x i64> %a0) { - ;CHECK-LABEL: stack_fold_permq - ;CHECK: vpermq $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_permq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 = mem[3,2,2,3,7,6,6,7] +; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7> ; add forces execution domain @@ -1264,8 +2294,19 @@ define <8 x i64> @stack_fold_permq(<8 x i64> %a0) { } define <8 x i64> @stack_fold_permq_mask(<8 x i64>* %passthru, <8 x i64> %a0, i8 %mask) { - ;CHECK-LABEL: stack_fold_permq_mask - ;CHECK: vpermq $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_permq_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 {%k1} = mem[3,2,2,3,7,6,6,7] +; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7> %3 = bitcast i8 %mask to <8 x i1> @@ -1278,8 +2319,16 @@ define <8 x i64> @stack_fold_permq_mask(<8 x i64>* %passthru, <8 x i64> %a0, i8 } define <8 x i64> @stack_fold_permq_maskz(<8 x i64>* %passthru, <8 
x i64> %a0, i8 %mask) { - ;CHECK-LABEL: stack_fold_permq_maskz - ;CHECK: vpermq $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_permq_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 {%k1} {z} = mem[3,2,2,3,7,6,6,7] +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7> %3 = bitcast i8 %mask to <8 x i1> @@ -1288,8 +2337,16 @@ define <8 x i64> @stack_fold_permq_maskz(<8 x i64>* %passthru, <8 x i64> %a0, i8 } define <8 x i64> @stack_fold_permqvar(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_permqvar - ;CHECK: vpermq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_permqvar: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpermq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0) ; add forces execution domain @@ -1299,8 +2356,18 @@ define <8 x i64> @stack_fold_permqvar(<8 x i64> %a0, <8 x i64> %a1) { declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>) nounwind readonly define <8 x i64> @stack_fold_permqvar_mask(<8 x i64>* %passthru, <8 x i64> %a0, <8 x i64> %a1, i8 %mask) { - ;CHECK-LABEL: stack_fold_permqvar_mask - ;CHECK: vpermq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_permqvar_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 +; CHECK-NEXT: vpermq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; CHECK-NEXT: vpsubq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0) %3 = bitcast i8 %mask to 
<8 x i1> @@ -1313,8 +2380,14 @@ define <8 x i64> @stack_fold_permqvar_mask(<8 x i64>* %passthru, <8 x i64> %a0, } define <64 x i8> @stack_fold_vpermt2b(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { - ;CHECK-LABEL: stack_fold_vpermt2b - ;CHECK: vpermt2b {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-LABEL: stack_fold_vpermt2b: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpermt2b {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) ret <64 x i8> %2 @@ -1322,8 +2395,14 @@ define <64 x i8> @stack_fold_vpermt2b(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x declare <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>) define <16 x i32> @stack_fold_vpermt2d(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { - ;CHECK-LABEL: stack_fold_vpermt2d - ;CHECK: vpermt2d {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-LABEL: stack_fold_vpermt2d: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) ret <16 x i32> %2 @@ -1331,8 +2410,14 @@ define <16 x i32> @stack_fold_vpermt2d(<16 x i32> %x0, <16 x i32> %x1, <16 x i32 declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>) define <8 x i64> @stack_fold_vpermt2q(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { - ;CHECK-LABEL: stack_fold_vpermt2q - ;CHECK: vpermt2q {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-LABEL: stack_fold_vpermt2q: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) ret <8 x i64> %2 @@ -1340,8 +2425,14 @@ define <8 x i64> @stack_fold_vpermt2q(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>) define <32 x i16> @stack_fold_vpermt2w(<32 x i16> %x0, <32 x i16> %x1, <32 x 
i16> %x2) { - ;CHECK-LABEL: stack_fold_vpermt2w - ;CHECK: vpermt2w {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-LABEL: stack_fold_vpermt2w: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpermt2w {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) ret <32 x i16> %2 @@ -1349,8 +2440,14 @@ define <32 x i16> @stack_fold_vpermt2w(<32 x i16> %x0, <32 x i16> %x1, <32 x i16 declare <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>) define <32 x i16> @stack_fold_permwvar(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_permwvar - ;CHECK: vpermw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_permwvar: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpermw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0) ret <32 x i16> %2 @@ -1358,8 +2455,17 @@ define <32 x i16> @stack_fold_permwvar(<32 x i16> %a0, <32 x i16> %a1) { declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>) nounwind readonly define <32 x i16> @stack_fold_permwvar_mask(<32 x i16>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i32 %mask) { - ;CHECK-LABEL: stack_fold_permwvar_mask - ;CHECK: vpermw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_permwvar_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT: vpermw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0) %3 = bitcast i32 %mask to <32 x i1> @@ -1370,8 +2476,15 @@ define <32 x i16> @stack_fold_permwvar_mask(<32 x i16>* %passthru, <32 x i16> %a } define <32 x i16> @stack_fold_permwvar_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { - ;CHECK-LABEL: stack_fold_permwvar_maskz 
- ;CHECK: vpermw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_permwvar_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpermw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0) %3 = bitcast i32 %mask to <32 x i1> @@ -1380,9 +2493,45 @@ define <32 x i16> @stack_fold_permwvar_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 } define i32 @stack_fold_pextrd(<4 x i32> %a0, <4 x i32> %a1) { - ;CHECK-LABEL: stack_fold_pextrd - ;CHECK: vpextrd $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill - ;CHECK: movl {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload +; CHECK-LABEL: stack_fold_pextrd: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 56 +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpextrd $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq ; add forces execution domain %1 = add <4 x i32> %a0, %a1 %2 = extractelement <4 x i32> %1, i32 1 @@ -1391,65 +2540,262 @@ define i32 @stack_fold_pextrd(<4 x i32> %a0, <4 x i32> %a1) { } define i64 @stack_fold_pextrq(<2 x i64> %a0) { - ;CHECK-LABEL: stack_fold_pextrq - ;CHECK: vpextrq $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill - ;CHECK: movq {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Reload +; CHECK-LABEL: stack_fold_pextrq: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 56 +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; 
CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: vpextrq $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = extractelement <2 x i64> %a0, i32 1 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() ret i64 %1 } define <16 x i8> @stack_fold_pinsrb(<16 x i8> %a0, i8 %a1) { - ;CHECK-LABEL: stack_fold_pinsrb - ;CHECK: vpinsrb $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload +; CHECK-LABEL: stack_fold_pinsrb: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 56 +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpinsrb $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() %2 = insertelement <16 x i8> %a0, i8 %a1, i32 1 ret <16 x i8> %2 } define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) { - ;CHECK-LABEL: stack_fold_pinsrd - ;CHECK: vpinsrd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload +; CHECK-LABEL: stack_fold_pinsrd: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 56 +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 
4-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpinsrd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() %2 = insertelement <4 x i32> %a0, i32 %a1, i32 1 ret <4 x i32> %2 } define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) { - ;CHECK-LABEL: stack_fold_pinsrq - ;CHECK: vpinsrq $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload +; CHECK-LABEL: stack_fold_pinsrq: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 56 +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpinsrq $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() %2 = insertelement <2 x i64> %a0, i64 %a1, i32 1 ret <2 x i64> %2 } define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) { - ;CHECK-LABEL: stack_fold_pinsrw - ;CHECK: vpinsrw $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload +; CHECK-LABEL: stack_fold_pinsrw: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 56 +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, 
%xmm0 # 4-byte Folded Reload +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() %2 = insertelement <8 x i16> %a0, i16 %a1, i32 1 ret <8 x i16> %2 } define <16 x i32> @stack_fold_vplzcntd(<16 x i32> %a0) { - ;CHECK-LABEL: stack_fold_vplzcntd - ;CHECK: vplzcntd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_vplzcntd: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vplzcntd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a0, i1 false) ret <16 x i32> %2 } define <8 x i64> @stack_fold_vplzcntq(<8 x i64> %a0) { - ;CHECK-LABEL: stack_fold_vplzcntq - ;CHECK: vplzcntq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_vplzcntq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vplzcntq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a0, i1 false) ret <8 x i64> %2 } define <32 x i16> @stack_fold_pmaddubsw_zmm(<64 x i8> %a0, <64 x i8> %a1) { - ;CHECK-LABEL: stack_fold_pmaddubsw_zmm - ;CHECK: vpmaddubsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaddubsw_zmm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1) ret <32 x i16> %2 @@ -1457,8 +2803,17 @@ define <32 x i16> @stack_fold_pmaddubsw_zmm(<64 x i8> %a0, <64 x i8> %a1) { declare <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x 
i8>) nounwind readnone define <32 x i16> @stack_fold_pmaddubsw_zmm_mask(<32 x i16>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i32 %mask) { - ;CHECK-LABEL: stack_fold_pmaddubsw_zmm_mask - ;CHECK: vpmaddubsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaddubsw_zmm_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1) %3 = bitcast i32 %mask to <32 x i1> @@ -1469,8 +2824,15 @@ define <32 x i16> @stack_fold_pmaddubsw_zmm_mask(<32 x i16>* %passthru, <64 x i8 } define <32 x i16> @stack_fold_pmaddubsw_zmm_maskz(<64 x i8> %a0, <64 x i8> %a1, i32 %mask) { - ;CHECK-LABEL: stack_fold_pmaddubsw_zmm_maskz - ;CHECK: vpmaddubsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaddubsw_zmm_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1) %3 = bitcast i32 %mask to <32 x i1> @@ -1479,8 +2841,14 @@ define <32 x i16> @stack_fold_pmaddubsw_zmm_maskz(<64 x i8> %a0, <64 x i8> %a1, } define <16 x i32> @stack_fold_pmaddwd_zmm(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_pmaddwd_zmm - ;CHECK: vpmaddwd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaddwd_zmm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1) ret <16 x i32> %2 @@ -1488,16 +2856,31 @@ define <16 x i32> @stack_fold_pmaddwd_zmm(<32 x i16> %a0, <32 x i16> %a1) { declare <16 x i32> 
@llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>) nounwind readnone define <16 x i32> @stack_fold_pmaddwd_zmm_commuted(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_pmaddwd_zmm_commuted: - ;CHECK: vpmaddwd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaddwd_zmm_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a0) ret <16 x i32> %2 } define <16 x i32> @stack_fold_pmaddwd_zmm_mask(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) { - ;CHECK-LABEL: stack_fold_pmaddwd_zmm_mask - ;CHECK: vpmaddwd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaddwd_zmm_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1) %3 = bitcast i16 %mask to <16 x i1> @@ -1508,8 +2891,17 @@ define <16 x i32> @stack_fold_pmaddwd_zmm_mask(<16 x i32>* %passthru, <32 x i16> } define <16 x i32> @stack_fold_pmaddwd_zmm_mask_commuted(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) { - ;CHECK-LABEL: stack_fold_pmaddwd_zmm_mask_commuted: - ;CHECK: vpmaddwd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaddwd_zmm_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a0) %3 = bitcast i16 %mask to <16 x i1> @@ -1520,8 +2912,15 @@ define <16 x i32> 
@stack_fold_pmaddwd_zmm_mask_commuted(<16 x i32>* %passthru, < } define <16 x i32> @stack_fold_pmaddwd_zmm_maskz(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) { - ;CHECK-LABEL: stack_fold_pmaddwd_zmm_maskz - ;CHECK: vpmaddwd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaddwd_zmm_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1) %3 = bitcast i16 %mask to <16 x i1> @@ -1530,8 +2929,15 @@ define <16 x i32> @stack_fold_pmaddwd_zmm_maskz(<16 x i32>* %passthru, <32 x i16 } define <16 x i32> @stack_fold_pmaddwd_zmm_maskz_commuted(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) { - ;CHECK-LABEL: stack_fold_pmaddwd_zmm_maskz_commuted: - ;CHECK: vpmaddwd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaddwd_zmm_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a0) %3 = bitcast i16 %mask to <16 x i1> @@ -1540,8 +2946,14 @@ define <16 x i32> @stack_fold_pmaddwd_zmm_maskz_commuted(<16 x i32>* %passthru, } define <64 x i8> @stack_fold_pmaxsb(<64 x i8> %a0, <64 x i8> %a1) { - ;CHECK-LABEL: stack_fold_pmaxsb - ;CHECK: vpmaxsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxsb: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <64 x i8> %a0, %a1 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 @@ -1549,8 +2961,14 @@ define <64 x i8> @stack_fold_pmaxsb(<64 x i8> %a0, <64 x i8> %a1) { } define <64 x i8> @stack_fold_pmaxsb_commuted(<64 x i8> %a0, <64 x i8> %a1) { - 
;CHECK-LABEL: stack_fold_pmaxsb_commuted: - ;CHECK: vpmaxsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxsb_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <64 x i8> %a1, %a0 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 @@ -1558,8 +2976,17 @@ define <64 x i8> @stack_fold_pmaxsb_commuted(<64 x i8> %a0, <64 x i8> %a1) { } define <64 x i8> @stack_fold_pmaxsb_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) { - ;CHECK-LABEL: stack_fold_pmaxsb_mask: - ;CHECK: vpmaxsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxsb_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <64 x i8> %a0, %a1 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 @@ -1571,8 +2998,17 @@ define <64 x i8> @stack_fold_pmaxsb_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask } define <64 x i8> @stack_fold_pmaxsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) { - ;CHECK-LABEL: stack_fold_pmaxsb_mask_commuted: - ;CHECK: vpmaxsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxsb_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <64 x i8> %a1, %a0 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 @@ -1584,8 +3020,15 @@ define <64 x i8> @stack_fold_pmaxsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, } define <64 x i8> @stack_fold_pmaxsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { - ;CHECK-LABEL: stack_fold_pmaxsb_maskz: - ;CHECK: vpmaxsb 
{{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxsb_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <64 x i8> %a0, %a1 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 @@ -1595,8 +3038,15 @@ define <64 x i8> @stack_fold_pmaxsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mas } define <64 x i8> @stack_fold_pmaxsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { - ;CHECK-LABEL: stack_fold_pmaxsb_maskz_commuted: - ;CHECK: vpmaxsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxsb_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <64 x i8> %a1, %a0 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 @@ -1606,8 +3056,14 @@ define <64 x i8> @stack_fold_pmaxsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, } define <16 x i32> @stack_fold_pmaxsd(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_pmaxsd - ;CHECK: vpmaxsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxsd: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <16 x i32> %a0, %a1 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 @@ -1615,8 +3071,14 @@ define <16 x i32> @stack_fold_pmaxsd(<16 x i32> %a0, <16 x i32> %a1) { } define <16 x i32> @stack_fold_pmaxsd_commuted(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_pmaxsd_commuted: - ;CHECK: vpmaxsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxsd_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: 
#NO_APP +; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <16 x i32> %a1, %a0 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 @@ -1624,8 +3086,17 @@ define <16 x i32> @stack_fold_pmaxsd_commuted(<16 x i32> %a0, <16 x i32> %a1) { } define <16 x i32> @stack_fold_pmaxsd_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) { - ;CHECK-LABEL: stack_fold_pmaxsd_mask: - ;CHECK: vpmaxsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxsd_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <16 x i32> %a0, %a1 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 @@ -1637,8 +3108,17 @@ define <16 x i32> @stack_fold_pmaxsd_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %m } define <16 x i32> @stack_fold_pmaxsd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) { - ;CHECK-LABEL: stack_fold_pmaxsd_mask_commuted: - ;CHECK: vpmaxsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxsd_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <16 x i32> %a1, %a0 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 @@ -1650,8 +3130,15 @@ define <16 x i32> @stack_fold_pmaxsd_mask_commuted(<16 x i32> %a0, <16 x i32> %a } define <16 x i32> @stack_fold_pmaxsd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { - ;CHECK-LABEL: stack_fold_pmaxsd_maskz: - ;CHECK: vpmaxsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxsd_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, 
%k1 +; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <16 x i32> %a0, %a1 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 @@ -1661,8 +3148,15 @@ define <16 x i32> @stack_fold_pmaxsd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 % } define <16 x i32> @stack_fold_pmaxsd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { - ;CHECK-LABEL: stack_fold_pmaxsd_maskz_commuted: - ;CHECK: vpmaxsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxsd_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <16 x i32> %a1, %a0 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 @@ -1672,8 +3166,14 @@ define <16 x i32> @stack_fold_pmaxsd_maskz_commuted(<16 x i32> %a0, <16 x i32> % } define <8 x i64> @stack_fold_pmaxsq(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_pmaxsq - ;CHECK: vpmaxsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxsq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <8 x i64> %a0, %a1 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 @@ -1681,8 +3181,14 @@ define <8 x i64> @stack_fold_pmaxsq(<8 x i64> %a0, <8 x i64> %a1) { } define <8 x i64> @stack_fold_pmaxsq_commuted(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_pmaxsq_commuted: - ;CHECK: vpmaxsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxsq_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <8 x i64> %a1, %a0 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 @@ -1690,8 +3196,17 @@ define <8 x i64> @stack_fold_pmaxsq_commuted(<8 x i64> %a0, <8 x i64> %a1) { } define <8 x i64> @stack_fold_pmaxsq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) { - ;CHECK-LABEL: stack_fold_pmaxsq_mask: - ;CHECK: vpmaxsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxsq_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <8 x i64> %a0, %a1 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 @@ -1703,8 +3218,17 @@ define <8 x i64> @stack_fold_pmaxsq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, } define <8 x i64> @stack_fold_pmaxsq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) { - ;CHECK-LABEL: stack_fold_pmaxsq_mask_commuted: - ;CHECK: vpmaxsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxsq_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <8 x i64> %a1, %a0 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 @@ -1716,8 +3240,15 @@ define <8 x i64> @stack_fold_pmaxsq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, } define <8 x i64> @stack_fold_pmaxsq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { - ;CHECK-LABEL: stack_fold_pmaxsq_maskz: - ;CHECK: vpmaxsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxsq_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <8 x i64> %a0, %a1 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 @@ -1727,8 +3258,15 @@ define <8 x i64> @stack_fold_pmaxsq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask } define <8 x i64> @stack_fold_pmaxsq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { - ;CHECK-LABEL: stack_fold_pmaxsq_maskz_commuted: - ;CHECK: vpmaxsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxsq_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <8 x i64> %a1, %a0 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 @@ -1738,8 +3276,14 @@ define <8 x i64> @stack_fold_pmaxsq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, } define <32 x i16> @stack_fold_pmaxsw(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_pmaxsw - ;CHECK: vpmaxsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxsw: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <32 x i16> %a0, %a1 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 @@ -1747,8 +3291,14 @@ define <32 x i16> @stack_fold_pmaxsw(<32 x i16> %a0, <32 x i16> %a1) { } define <32 x i16> @stack_fold_pmaxsw_commuted(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_pmaxsw_commuted: - ;CHECK: vpmaxsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxsw_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <32 x i16> %a1, %a0 %3 = select <32 x i1> %2, <32 
x i16> %a1, <32 x i16> %a0 @@ -1756,8 +3306,17 @@ define <32 x i16> @stack_fold_pmaxsw_commuted(<32 x i16> %a0, <32 x i16> %a1) { } define <32 x i16> @stack_fold_pmaxsw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) { - ;CHECK-LABEL: stack_fold_pmaxsw_mask: - ;CHECK: vpmaxsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxsw_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <32 x i16> %a0, %a1 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 @@ -1769,8 +3328,17 @@ define <32 x i16> @stack_fold_pmaxsw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %m } define <32 x i16> @stack_fold_pmaxsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) { - ;CHECK-LABEL: stack_fold_pmaxsw_mask_commuted: - ;CHECK: vpmaxsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxsw_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <32 x i16> %a1, %a0 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 @@ -1782,8 +3350,15 @@ define <32 x i16> @stack_fold_pmaxsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a } define <32 x i16> @stack_fold_pmaxsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { - ;CHECK-LABEL: stack_fold_pmaxsw_maskz: - ;CHECK: vpmaxsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxsw_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <32 x i16> %a0, %a1 %3 = select <32 x i1> 
%2, <32 x i16> %a0, <32 x i16> %a1 @@ -1793,8 +3368,15 @@ define <32 x i16> @stack_fold_pmaxsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 % } define <32 x i16> @stack_fold_pmaxsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { - ;CHECK-LABEL: stack_fold_pmaxsw_maskz_commuted: - ;CHECK: vpmaxsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxsw_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp sgt <32 x i16> %a1, %a0 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 @@ -1804,8 +3386,14 @@ define <32 x i16> @stack_fold_pmaxsw_maskz_commuted(<32 x i16> %a0, <32 x i16> % } define <64 x i8> @stack_fold_pmaxub(<64 x i8> %a0, <64 x i8> %a1) { - ;CHECK-LABEL: stack_fold_pmaxub - ;CHECK: vpmaxub {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxub: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ugt <64 x i8> %a0, %a1 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 @@ -1813,8 +3401,14 @@ define <64 x i8> @stack_fold_pmaxub(<64 x i8> %a0, <64 x i8> %a1) { } define <64 x i8> @stack_fold_pmaxub_commuted(<64 x i8> %a0, <64 x i8> %a1) { - ;CHECK-LABEL: stack_fold_pmaxub_commuted: - ;CHECK: vpmaxub {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxub_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ugt <64 x i8> %a1, %a0 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 @@ -1822,8 +3416,17 @@ define <64 x i8> @stack_fold_pmaxub_commuted(<64 x i8> %a0, <64 x i8> %a1) { } define <64 x i8> @stack_fold_pmaxub_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) { - ;CHECK-LABEL: stack_fold_pmaxub_mask: - ;CHECK: vpmaxub {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, 
{{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxub_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ugt <64 x i8> %a0, %a1 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 @@ -1835,8 +3438,17 @@ define <64 x i8> @stack_fold_pmaxub_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask } define <64 x i8> @stack_fold_pmaxub_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) { - ;CHECK-LABEL: stack_fold_pmaxub_mask_commuted: - ;CHECK: vpmaxub {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxub_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ugt <64 x i8> %a1, %a0 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 @@ -1848,8 +3460,15 @@ define <64 x i8> @stack_fold_pmaxub_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, } define <64 x i8> @stack_fold_pmaxub_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { - ;CHECK-LABEL: stack_fold_pmaxub_maskz: - ;CHECK: vpmaxub {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxub_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ugt <64 x i8> %a0, %a1 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 @@ -1859,8 +3478,15 @@ define <64 x i8> @stack_fold_pmaxub_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mas } define <64 x i8> @stack_fold_pmaxub_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { - ;CHECK-LABEL: stack_fold_pmaxub_maskz_commuted: - ;CHECK: vpmaxub {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, 
{{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxub_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ugt <64 x i8> %a1, %a0 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 @@ -1870,8 +3496,14 @@ define <64 x i8> @stack_fold_pmaxub_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, } define <16 x i32> @stack_fold_pmaxud(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_pmaxud - ;CHECK: vpmaxud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxud: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ugt <16 x i32> %a0, %a1 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 @@ -1879,8 +3511,14 @@ define <16 x i32> @stack_fold_pmaxud(<16 x i32> %a0, <16 x i32> %a1) { } define <16 x i32> @stack_fold_pmaxud_commuted(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_pmaxud_commuted: - ;CHECK: vpmaxud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxud_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ugt <16 x i32> %a1, %a0 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 @@ -1888,8 +3526,17 @@ define <16 x i32> @stack_fold_pmaxud_commuted(<16 x i32> %a0, <16 x i32> %a1) { } define <16 x i32> @stack_fold_pmaxud_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) { - ;CHECK-LABEL: stack_fold_pmaxud_mask: - ;CHECK: vpmaxud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxud_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; 
CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ugt <16 x i32> %a0, %a1 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 @@ -1901,8 +3548,17 @@ define <16 x i32> @stack_fold_pmaxud_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %m } define <16 x i32> @stack_fold_pmaxud_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) { - ;CHECK-LABEL: stack_fold_pmaxud_mask_commuted: - ;CHECK: vpmaxud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxud_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ugt <16 x i32> %a1, %a0 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 @@ -1914,8 +3570,15 @@ define <16 x i32> @stack_fold_pmaxud_mask_commuted(<16 x i32> %a0, <16 x i32> %a } define <16 x i32> @stack_fold_pmaxud_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { - ;CHECK-LABEL: stack_fold_pmaxud_maskz: - ;CHECK: vpmaxud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxud_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ugt <16 x i32> %a0, %a1 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 @@ -1925,8 +3588,15 @@ define <16 x i32> @stack_fold_pmaxud_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 % } define <16 x i32> @stack_fold_pmaxud_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { - ;CHECK-LABEL: stack_fold_pmaxud_maskz_commuted: - ;CHECK: vpmaxud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxud_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmaxud 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ugt <16 x i32> %a1, %a0 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 @@ -1936,8 +3606,14 @@ define <16 x i32> @stack_fold_pmaxud_maskz_commuted(<16 x i32> %a0, <16 x i32> % } define <8 x i64> @stack_fold_pmaxuq(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_pmaxuq - ;CHECK: vpmaxuq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxuq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ugt <8 x i64> %a0, %a1 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 @@ -1945,8 +3621,14 @@ define <8 x i64> @stack_fold_pmaxuq(<8 x i64> %a0, <8 x i64> %a1) { } define <8 x i64> @stack_fold_pmaxuq_commuted(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_pmaxuq_commuted: - ;CHECK: vpmaxuq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxuq_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ugt <8 x i64> %a1, %a0 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 @@ -1954,8 +3636,17 @@ define <8 x i64> @stack_fold_pmaxuq_commuted(<8 x i64> %a0, <8 x i64> %a1) { } define <8 x i64> @stack_fold_pmaxuq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) { - ;CHECK-LABEL: stack_fold_pmaxuq_mask: - ;CHECK: vpmaxuq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxuq_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ugt <8 x i64> %a0, %a1 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 @@ -1967,8 +3658,17 @@ define <8 x i64> @stack_fold_pmaxuq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, } define <8 x i64> @stack_fold_pmaxuq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) { - ;CHECK-LABEL: stack_fold_pmaxuq_mask_commuted: - ;CHECK: vpmaxuq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxuq_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ugt <8 x i64> %a1, %a0 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 @@ -1980,8 +3680,15 @@ define <8 x i64> @stack_fold_pmaxuq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, } define <8 x i64> @stack_fold_pmaxuq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { - ;CHECK-LABEL: stack_fold_pmaxuq_maskz: - ;CHECK: vpmaxuq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxuq_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ugt <8 x i64> %a0, %a1 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 @@ -1991,8 +3698,15 @@ define <8 x i64> @stack_fold_pmaxuq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask } define <8 x i64> @stack_fold_pmaxuq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { - ;CHECK-LABEL: stack_fold_pmaxuq_maskz_commuted: - ;CHECK: vpmaxuq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxuq_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ugt <8 x i64> %a1, %a0 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 @@ -2002,8 +3716,14 @@ define <8 x i64> @stack_fold_pmaxuq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, } define <32 x i16> @stack_fold_pmaxuw(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_pmaxuw - ;CHECK: vpmaxuw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxuw: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ugt <32 x i16> %a0, %a1 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 @@ -2011,8 +3731,14 @@ define <32 x i16> @stack_fold_pmaxuw(<32 x i16> %a0, <32 x i16> %a1) { } define <32 x i16> @stack_fold_pmaxuw_commuted(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_pmaxuw_commuted: - ;CHECK: vpmaxuw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxuw_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ugt <32 x i16> %a1, %a0 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 @@ -2020,8 +3746,17 @@ define <32 x i16> @stack_fold_pmaxuw_commuted(<32 x i16> %a0, <32 x i16> %a1) { } define <32 x i16> @stack_fold_pmaxuw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) { - ;CHECK-LABEL: stack_fold_pmaxuw_mask: - ;CHECK: vpmaxuw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxuw_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ugt <32 x i16> %a0, %a1 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 @@ -2033,8 +3768,17 @@ define <32 x i16> @stack_fold_pmaxuw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %m } define <32 x i16> @stack_fold_pmaxuw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) { - ;CHECK-LABEL: stack_fold_pmaxuw_mask_commuted: - ;CHECK: vpmaxuw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxuw_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ugt <32 x i16> %a1, %a0 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 @@ -2046,8 +3790,15 @@ define <32 x i16> @stack_fold_pmaxuw_mask_commuted(<32 x i16> %a0, <32 x i16> %a } define <32 x i16> @stack_fold_pmaxuw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { - ;CHECK-LABEL: stack_fold_pmaxuw_maskz: - ;CHECK: vpmaxuw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxuw_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ugt <32 x i16> %a0, %a1 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 @@ -2057,8 +3808,15 @@ define <32 x i16> @stack_fold_pmaxuw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 % } define <32 x i16> @stack_fold_pmaxuw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { - ;CHECK-LABEL: stack_fold_pmaxuw_maskz_commuted: - ;CHECK: vpmaxuw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmaxuw_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ugt <32 x i16> %a1, %a0 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 @@ -2068,8 +3826,14 @@ define <32 x i16> @stack_fold_pmaxuw_maskz_commuted(<32 x i16> %a0, <32 x i16> % } define <64 x i8> @stack_fold_pminsb(<64 x i8> %a0, <64 x i8> %a1) { - ;CHECK-LABEL: stack_fold_pminsb - ;CHECK: vpminsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminsb: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp slt <64 x i8> %a0, %a1 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 @@ -2077,8 +3841,14 @@ define <64 x i8> @stack_fold_pminsb(<64 x i8> %a0, <64 x i8> %a1) { } define <64 x i8> @stack_fold_pminsb_commuted(<64 x i8> %a0, <64 x i8> %a1) { - ;CHECK-LABEL: stack_fold_pminsb_commuted: - ;CHECK: vpminsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminsb_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp slt <64 x i8> %a1, %a0 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 @@ -2086,8 +3856,17 @@ define <64 x i8> @stack_fold_pminsb_commuted(<64 x i8> %a0, <64 x i8> %a1) { } define <64 x i8> @stack_fold_pminsb_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) { - ;CHECK-LABEL: stack_fold_pminsb_mask: - ;CHECK: vpminsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminsb_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp slt 
<64 x i8> %a0, %a1 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 @@ -2099,8 +3878,17 @@ define <64 x i8> @stack_fold_pminsb_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask } define <64 x i8> @stack_fold_pminsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) { - ;CHECK-LABEL: stack_fold_pminsb_mask_commuted: - ;CHECK: vpminsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminsb_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp slt <64 x i8> %a1, %a0 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 @@ -2112,8 +3900,15 @@ define <64 x i8> @stack_fold_pminsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, } define <64 x i8> @stack_fold_pminsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { - ;CHECK-LABEL: stack_fold_pminsb_maskz: - ;CHECK: vpminsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminsb_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp slt <64 x i8> %a0, %a1 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 @@ -2123,8 +3918,15 @@ define <64 x i8> @stack_fold_pminsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mas } define <64 x i8> @stack_fold_pminsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { - ;CHECK-LABEL: stack_fold_pminsb_maskz_commuted: - ;CHECK: vpminsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminsb_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp slt <64 x i8> %a1, %a0 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 
@@ -2134,8 +3936,14 @@ define <64 x i8> @stack_fold_pminsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, } define <16 x i32> @stack_fold_pminsd(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_pminsd - ;CHECK: vpminsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminsd: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp slt <16 x i32> %a0, %a1 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 @@ -2143,8 +3951,14 @@ define <16 x i32> @stack_fold_pminsd(<16 x i32> %a0, <16 x i32> %a1) { } define <16 x i32> @stack_fold_pminsd_commuted(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_pminsd_commuted: - ;CHECK: vpminsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminsd_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp slt <16 x i32> %a1, %a0 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 @@ -2152,8 +3966,17 @@ define <16 x i32> @stack_fold_pminsd_commuted(<16 x i32> %a0, <16 x i32> %a1) { } define <16 x i32> @stack_fold_pminsd_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) { - ;CHECK-LABEL: stack_fold_pminsd_mask: - ;CHECK: vpminsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminsd_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp slt <16 x i32> %a0, %a1 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 @@ -2165,8 +3988,17 @@ define <16 x i32> @stack_fold_pminsd_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %m } define <16 x i32> @stack_fold_pminsd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) { - ;CHECK-LABEL: 
stack_fold_pminsd_mask_commuted: - ;CHECK: vpminsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminsd_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp slt <16 x i32> %a1, %a0 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 @@ -2178,8 +4010,15 @@ define <16 x i32> @stack_fold_pminsd_mask_commuted(<16 x i32> %a0, <16 x i32> %a } define <16 x i32> @stack_fold_pminsd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { - ;CHECK-LABEL: stack_fold_pminsd_maskz: - ;CHECK: vpminsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminsd_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp slt <16 x i32> %a0, %a1 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 @@ -2189,8 +4028,15 @@ define <16 x i32> @stack_fold_pminsd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 % } define <16 x i32> @stack_fold_pminsd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { - ;CHECK-LABEL: stack_fold_pminsd_maskz_commuted: - ;CHECK: vpminsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminsd_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp slt <16 x i32> %a1, %a0 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 @@ -2200,8 +4046,14 @@ define <16 x i32> @stack_fold_pminsd_maskz_commuted(<16 x i32> %a0, <16 x i32> % } define <8 x i64> @stack_fold_pminsq(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_pminsq - ;CHECK: vpminsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 
64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminsq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp slt <8 x i64> %a0, %a1 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 @@ -2209,8 +4061,14 @@ define <8 x i64> @stack_fold_pminsq(<8 x i64> %a0, <8 x i64> %a1) { } define <8 x i64> @stack_fold_pminsq_commuted(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_pminsq_commuted: - ;CHECK: vpminsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminsq_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp slt <8 x i64> %a1, %a0 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 @@ -2218,8 +4076,17 @@ define <8 x i64> @stack_fold_pminsq_commuted(<8 x i64> %a0, <8 x i64> %a1) { } define <8 x i64> @stack_fold_pminsq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) { - ;CHECK-LABEL: stack_fold_pminsq_mask: - ;CHECK: vpminsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminsq_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp slt <8 x i64> %a0, %a1 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 @@ -2231,8 +4098,17 @@ define <8 x i64> @stack_fold_pminsq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, } define <8 x i64> @stack_fold_pminsq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) { - ;CHECK-LABEL: stack_fold_pminsq_mask_commuted: - ;CHECK: vpminsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminsq_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP 
+; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp slt <8 x i64> %a1, %a0 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 @@ -2244,8 +4120,15 @@ define <8 x i64> @stack_fold_pminsq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, } define <8 x i64> @stack_fold_pminsq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { - ;CHECK-LABEL: stack_fold_pminsq_maskz: - ;CHECK: vpminsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminsq_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp slt <8 x i64> %a0, %a1 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 @@ -2255,8 +4138,15 @@ define <8 x i64> @stack_fold_pminsq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask } define <8 x i64> @stack_fold_pminsq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { - ;CHECK-LABEL: stack_fold_pminsq_maskz_commuted: - ;CHECK: vpminsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminsq_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp slt <8 x i64> %a1, %a0 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 @@ -2266,8 +4156,14 @@ define <8 x i64> @stack_fold_pminsq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, } define <32 x i16> @stack_fold_pminsw(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_pminsw - ;CHECK: vpminsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminsw: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp slt <32 x i16> %a0, %a1 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 @@ -2275,8 +4171,14 @@ define <32 x i16> @stack_fold_pminsw(<32 x i16> %a0, <32 x i16> %a1) { } define <32 x i16> @stack_fold_pminsw_commuted(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_pminsw_commuted: - ;CHECK: vpminsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminsw_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp slt <32 x i16> %a1, %a0 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 @@ -2284,8 +4186,17 @@ define <32 x i16> @stack_fold_pminsw_commuted(<32 x i16> %a0, <32 x i16> %a1) { } define <32 x i16> @stack_fold_pminsw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) { - ;CHECK-LABEL: stack_fold_pminsw_mask: - ;CHECK: vpminsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminsw_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp slt <32 x i16> %a0, %a1 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 @@ -2297,8 +4208,17 @@ define <32 x i16> @stack_fold_pminsw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %m } define <32 x i16> @stack_fold_pminsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) { - ;CHECK-LABEL: stack_fold_pminsw_mask_commuted: - ;CHECK: vpminsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminsw_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp slt <32 x i16> %a1, %a0 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 @@ -2310,8 +4230,15 @@ define <32 x i16> @stack_fold_pminsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a } define <32 x i16> @stack_fold_pminsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { - ;CHECK-LABEL: stack_fold_pminsw_maskz: - ;CHECK: vpminsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminsw_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp slt <32 x i16> %a0, %a1 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 @@ -2321,8 +4248,15 @@ define <32 x i16> @stack_fold_pminsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 % } define <32 x i16> @stack_fold_pminsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { - ;CHECK-LABEL: stack_fold_pminsw_maskz_commuted: - ;CHECK: vpminsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminsw_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp slt <32 x i16> %a1, %a0 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 @@ -2332,8 +4266,14 @@ define <32 x i16> @stack_fold_pminsw_maskz_commuted(<32 x i16> %a0, <32 x i16> % } define <64 x i8> @stack_fold_pminub(<64 x i8> %a0, <64 x i8> %a1) { - ;CHECK-LABEL: stack_fold_pminub - ;CHECK: vpminub {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminub: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ult <64 x i8> %a0, %a1 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 @@ -2341,8 +4281,14 @@ define <64 x i8> @stack_fold_pminub(<64 x i8> %a0, <64 x i8> %a1) { } define <64 x i8> @stack_fold_pminub_commuted(<64 x i8> %a0, <64 x i8> %a1) { - ;CHECK-LABEL: stack_fold_pminub_commuted: - ;CHECK: vpminub {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminub_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ult <64 x i8> %a1, %a0 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 @@ -2350,8 +4296,17 @@ define <64 x i8> @stack_fold_pminub_commuted(<64 x i8> %a0, <64 x i8> %a1) { } define <64 x i8> @stack_fold_pminub_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) { - ;CHECK-LABEL: stack_fold_pminub_mask: - ;CHECK: vpminub {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminub_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ult <64 x i8> %a0, %a1 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 @@ -2363,8 +4318,17 @@ define <64 x i8> @stack_fold_pminub_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask } define <64 x i8> @stack_fold_pminub_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) { - ;CHECK-LABEL: stack_fold_pminub_mask_commuted: - ;CHECK: vpminub {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminub_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ult <64 x i8> %a1, %a0 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 @@ -2376,8 +4340,15 @@ define <64 x i8> @stack_fold_pminub_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, } define <64 x i8> @stack_fold_pminub_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { - ;CHECK-LABEL: stack_fold_pminub_maskz: - ;CHECK: vpminub {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminub_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ult <64 x i8> %a0, %a1 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 @@ -2387,8 +4358,15 @@ define <64 x i8> @stack_fold_pminub_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mas } define <64 x i8> @stack_fold_pminub_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { - ;CHECK-LABEL: stack_fold_pminub_maskz_commuted: - ;CHECK: vpminub {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminub_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ult <64 x i8> %a1, %a0 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 @@ -2398,8 +4376,14 @@ define <64 x i8> @stack_fold_pminub_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, } define <16 x i32> @stack_fold_pminud(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_pminud - ;CHECK: vpminud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminud: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ult <16 x i32> %a0, %a1 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 @@ -2407,8 +4391,14 @@ define <16 x i32> @stack_fold_pminud(<16 x i32> %a0, <16 x i32> %a1) { } define <16 x i32> @stack_fold_pminud_commuted(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_pminud_commuted: - ;CHECK: vpminud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminud_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ult <16 x i32> %a1, %a0 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 @@ -2416,8 +4406,17 @@ define <16 x i32> @stack_fold_pminud_commuted(<16 x i32> %a0, <16 x i32> %a1) { } define <16 x i32> @stack_fold_pminud_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) { - ;CHECK-LABEL: stack_fold_pminud_mask: - ;CHECK: vpminud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminud_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ult <16 x i32> %a0, %a1 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 @@ -2429,8 +4428,17 @@ define <16 x i32> @stack_fold_pminud_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %m } define <16 x i32> @stack_fold_pminud_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) { - ;CHECK-LABEL: stack_fold_pminud_mask_commuted: - ;CHECK: vpminud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminud_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ult <16 x i32> %a1, %a0 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 @@ -2442,8 +4450,15 @@ define <16 x i32> @stack_fold_pminud_mask_commuted(<16 x i32> %a0, <16 x i32> %a } define <16 x i32> @stack_fold_pminud_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { - ;CHECK-LABEL: stack_fold_pminud_maskz: - ;CHECK: vpminud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminud_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ult <16 x i32> %a0, %a1 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 @@ -2453,8 +4468,15 @@ define <16 x i32> @stack_fold_pminud_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 % } define <16 x i32> @stack_fold_pminud_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { - ;CHECK-LABEL: stack_fold_pminud_maskz_commuted: - ;CHECK: vpminud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminud_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ult <16 x i32> %a1, %a0 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 @@ -2464,8 +4486,14 @@ define <16 x i32> @stack_fold_pminud_maskz_commuted(<16 x i32> %a0, <16 x i32> % } define <8 x i64> @stack_fold_pminuq(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_pminuq - ;CHECK: vpminuq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminuq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ult <8 x i64> %a0, %a1 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 @@ -2473,8 +4501,14 @@ define <8 x i64> @stack_fold_pminuq(<8 x i64> %a0, <8 x i64> %a1) { } define <8 x i64> @stack_fold_pminuq_commuted(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_pminuq_commuted: - ;CHECK: vpminuq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminuq_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ult <8 x i64> %a1, %a0 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 @@ -2482,8 +4516,17 @@ define <8 x i64> @stack_fold_pminuq_commuted(<8 x i64> %a0, <8 x i64> %a1) { } define <8 x i64> @stack_fold_pminuq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) { - ;CHECK-LABEL: stack_fold_pminuq_mask: - ;CHECK: vpminuq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminuq_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ult <8 x i64> %a0, %a1 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 @@ -2495,8 +4538,17 @@ define <8 x i64> @stack_fold_pminuq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, } define <8 x i64> @stack_fold_pminuq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) { - ;CHECK-LABEL: stack_fold_pminuq_mask_commuted: - ;CHECK: vpminuq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminuq_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ult <8 x i64> %a1, %a0 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 @@ -2508,8 +4560,15 @@ define <8 x i64> @stack_fold_pminuq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, } define <8 x i64> @stack_fold_pminuq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { - ;CHECK-LABEL: stack_fold_pminuq_maskz: - ;CHECK: vpminuq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminuq_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ult <8 x i64> %a0, %a1 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 @@ -2519,8 +4578,15 @@ define <8 x i64> @stack_fold_pminuq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask } define <8 x i64> @stack_fold_pminuq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { - ;CHECK-LABEL: stack_fold_pminuq_maskz_commuted: - ;CHECK: vpminuq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminuq_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ult <8 x i64> %a1, %a0 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 @@ -2530,8 +4596,14 @@ define <8 x i64> @stack_fold_pminuq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, } define <32 x i16> @stack_fold_pminuw(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_pminuw - ;CHECK: vpminuw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminuw: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 
= icmp ult <32 x i16> %a0, %a1 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 @@ -2539,8 +4611,14 @@ define <32 x i16> @stack_fold_pminuw(<32 x i16> %a0, <32 x i16> %a1) { } define <32 x i16> @stack_fold_pminuw_commuted(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_pminuw_commuted: - ;CHECK: vpminuw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminuw_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ult <32 x i16> %a1, %a0 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 @@ -2548,8 +4626,17 @@ define <32 x i16> @stack_fold_pminuw_commuted(<32 x i16> %a0, <32 x i16> %a1) { } define <32 x i16> @stack_fold_pminuw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) { - ;CHECK-LABEL: stack_fold_pminuw_mask: - ;CHECK: vpminuw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminuw_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ult <32 x i16> %a0, %a1 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 @@ -2561,8 +4648,17 @@ define <32 x i16> @stack_fold_pminuw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %m } define <32 x i16> @stack_fold_pminuw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) { - ;CHECK-LABEL: stack_fold_pminuw_mask_commuted: - ;CHECK: vpminuw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminuw_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 +; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ult <32 x i16> %a1, %a0 %3 = select <32 x i1> %2, <32 x 
i16> %a1, <32 x i16> %a0 @@ -2574,8 +4670,15 @@ define <32 x i16> @stack_fold_pminuw_mask_commuted(<32 x i16> %a0, <32 x i16> %a } define <32 x i16> @stack_fold_pminuw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { - ;CHECK-LABEL: stack_fold_pminuw_maskz: - ;CHECK: vpminuw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminuw_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ult <32 x i16> %a0, %a1 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 @@ -2585,8 +4688,15 @@ define <32 x i16> @stack_fold_pminuw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 % } define <32 x i16> @stack_fold_pminuw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { - ;CHECK-LABEL: stack_fold_pminuw_maskz_commuted: - ;CHECK: vpminuw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pminuw_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = icmp ult <32 x i16> %a1, %a0 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 @@ -2596,8 +4706,15 @@ define <32 x i16> @stack_fold_pminuw_maskz_commuted(<32 x i16> %a0, <32 x i16> % } define <16 x i8> @stack_fold_vpmovdb(<16 x i32> %a0) { - ;CHECK-LABEL: stack_fold_vpmovdb - ;CHECK: vpmovdb %zmm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill +; CHECK-LABEL: stack_fold_vpmovdb: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovdb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1) %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ret <16 x i8> %1 @@ -2605,8 +4722,14 @@ define <16 x i8> @stack_fold_vpmovdb(<16 x i32> %a0) { declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16) define <16 x i16> @stack_fold_vpmovdw(<16 x i32> %a0) { - ;CHECK-LABEL: 
stack_fold_vpmovdw - ;CHECK: vpmovdw %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill +; CHECK-LABEL: stack_fold_vpmovdw: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovdw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: retq %1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1) %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ret <16 x i16> %1 @@ -2614,8 +4737,17 @@ define <16 x i16> @stack_fold_vpmovdw(<16 x i32> %a0) { declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16) define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) { - ;CHECK-LABEL: stack_fold_movq_load - ;CHECK: vmovq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload +; CHECK-LABEL: stack_fold_movq_load: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2> ; add forces execution domain @@ -2624,8 +4756,14 @@ define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) { } define <8 x i32> @stack_fold_vpmovqd(<8 x i64> %a0) { - ;CHECK-LABEL: stack_fold_vpmovqd - ;CHECK: vpmovqd %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill +; CHECK-LABEL: stack_fold_vpmovqd: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovqd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: retq %1 = trunc <8 x i64> %a0 to <8 x i32> %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ret <8 x i32> %1 @@ -2633,8 +4771,15 @@ define <8 x i32> @stack_fold_vpmovqd(<8 x i64> %a0) { declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8) define <8 x i16> @stack_fold_vpmovqw(<8 x i64> %a0) { - ;CHECK-LABEL: stack_fold_vpmovqw - ;CHECK: vpmovqw %zmm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill +; CHECK-LABEL: stack_fold_vpmovqw: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovqw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: 
retq %1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1) %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ret <8 x i16> %1 @@ -2642,8 +4787,14 @@ define <8 x i16> @stack_fold_vpmovqw(<8 x i64> %a0) { declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8) define <32 x i8> @stack_fold_vpmovwb(<32 x i16> %a0) { - ;CHECK-LABEL: stack_fold_vpmovwb - ;CHECK: vpmovwb %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill +; CHECK-LABEL: stack_fold_vpmovwb: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovwb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: retq %1 = trunc <32 x i16> %a0 to <32 x i8> %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ret <32 x i8> %1 @@ -2651,8 +4802,15 @@ define <32 x i8> @stack_fold_vpmovwb(<32 x i16> %a0) { declare <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16>, <32 x i8>, i32) define <16 x i8> @stack_fold_vpmovsdb(<16 x i32> %a0) { - ;CHECK-LABEL: stack_fold_vpmovsdb - ;CHECK: vpmovsdb %zmm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill +; CHECK-LABEL: stack_fold_vpmovsdb: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsdb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1) %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ret <16 x i8> %1 @@ -2660,8 +4818,14 @@ define <16 x i8> @stack_fold_vpmovsdb(<16 x i32> %a0) { declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16) define <16 x i16> @stack_fold_vpmovsdw(<16 x i32> %a0) { - ;CHECK-LABEL: stack_fold_vpmovsdw - ;CHECK: vpmovsdw %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill +; CHECK-LABEL: stack_fold_vpmovsdw: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsdw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: retq %1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1) %2 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ret <16 x i16> %1 @@ -2669,8 +4833,14 @@ define <16 x i16> @stack_fold_vpmovsdw(<16 x i32> %a0) { declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16) define <8 x i32> @stack_fold_vpmovsqd(<8 x i64> %a0) { - ;CHECK-LABEL: stack_fold_vpmovsqd - ;CHECK: vpmovsqd %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill +; CHECK-LABEL: stack_fold_vpmovsqd: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsqd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: retq %1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %a0, <8 x i32> undef, i8 -1) %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ret <8 x i32> %1 @@ -2678,8 +4848,15 @@ define <8 x i32> @stack_fold_vpmovsqd(<8 x i64> %a0) { declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8) define <8 x i16> @stack_fold_vpmovsqw(<8 x i64> %a0) { - ;CHECK-LABEL: stack_fold_vpmovsqw - ;CHECK: vpmovsqw %zmm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill +; CHECK-LABEL: stack_fold_vpmovsqw: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsqw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1) %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ret <8 x i16> %1 @@ -2687,8 +4864,14 @@ define <8 x i16> @stack_fold_vpmovsqw(<8 x i64> %a0) { declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8) define <32 x i8> @stack_fold_vpmovswb(<32 x i16> %a0) { - ;CHECK-LABEL: stack_fold_vpmovswb - ;CHECK: vpmovswb %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill +; CHECK-LABEL: stack_fold_vpmovswb: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovswb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: retq %1 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %a0, <32 x i8> undef, i32 -1) %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ret <32 x i8> %1 @@ -2696,16 +4879,28 @@ define <32 x i8> @stack_fold_vpmovswb(<32 x 
i16> %a0) { declare <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16>, <32 x i8>, i32) define <16 x i32> @stack_fold_pmovsxbd_zmm(<16 x i8> %a0) { - ;CHECK-LABEL: stack_fold_pmovsxbd_zmm - ;CHECK: vpmovsxbd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload +; CHECK-LABEL: stack_fold_pmovsxbd_zmm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = sext <16 x i8> %a0 to <16 x i32> ret <16 x i32> %2 } define <8 x i64> @stack_fold_pmovsxbq_zmm(<16 x i8> %a0) { - ;CHECK-LABEL: stack_fold_pmovsxbq_zmm - ;CHECK: pmovsxbq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload +; CHECK-LABEL: stack_fold_pmovsxbq_zmm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %3 = sext <8 x i8> %2 to <8 x i64> @@ -2713,40 +4908,71 @@ define <8 x i64> @stack_fold_pmovsxbq_zmm(<16 x i8> %a0) { } define <32 x i16> @stack_fold_pmovsxbw_zmm(<32 x i8> %a0) { - ;CHECK-LABEL: stack_fold_pmovsxbw_zmm - ;CHECK: vpmovsxbw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload +; CHECK-LABEL: stack_fold_pmovsxbw_zmm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = sext <32 x i8> %a0 to <32 x i16> ret <32 x i16> %2 } define <8 x i64> @stack_fold_pmovsxdq_zmm(<8 x i32> %a0) { - ;CHECK-LABEL: stack_fold_pmovsxdq_zmm - ;CHECK: vpmovsxdq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload +; CHECK-LABEL: stack_fold_pmovsxdq_zmm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = sext <8 x i32> %a0 to <8 x i64> ret <8 x i64> %2 } define <16 x i32> @stack_fold_pmovsxwd_zmm(<16 x i16> %a0) { - ;CHECK-LABEL: stack_fold_pmovsxwd_zmm - ;CHECK: vpmovsxwd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload +; CHECK-LABEL: stack_fold_pmovsxwd_zmm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = sext <16 x i16> %a0 to <16 x i32> ret <16 x i32> %2 } define <8 x i64> @stack_fold_pmovsxwq_zmm(<8 x i16> %a0) { - ;CHECK-LABEL: stack_fold_pmovsxwq_zmm - ;CHECK: vpmovsxwq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload +; CHECK-LABEL: stack_fold_pmovsxwq_zmm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = sext <8 x i16> %a0 to <8 x i64> ret <8 x i64> %2 } define <8 x i64> @stack_fold_pmovsxwq_mask_zmm(<8 x i64> %passthru, <8 x i16> %a0, i8 %mask) { - ;CHECK-LABEL: stack_fold_pmovsxwq_mask_zmm - ;CHECK: vpmovsxwq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload +; CHECK-LABEL: stack_fold_pmovsxwq_mask_zmm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 16-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = sext <8 x i16> %a0 to <8 x i64> %3 = bitcast i8 %mask to <8 x i1> @@ -2755,8 +4981,15 @@ define <8 x i64> @stack_fold_pmovsxwq_mask_zmm(<8 x i64> %passthru, <8 x i16> %a } define <8 x i64> @stack_fold_pmovsxwq_maskz_zmm(<8 x i16> %a0, i8 %mask) { - ;CHECK-LABEL: stack_fold_pmovsxwq_maskz_zmm - ;CHECK: vpmovsxwq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload +; CHECK-LABEL: stack_fold_pmovsxwq_maskz_zmm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 16-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = sext <8 x i16> %a0 to <8 x i64> %3 = bitcast i8 %mask to <8 x i1> @@ -2765,8 +4998,15 @@ define <8 x i64> @stack_fold_pmovsxwq_maskz_zmm(<8 x i16> %a0, i8 %mask) { } define <16 x i8> @stack_fold_vpmovusdb(<16 x i32> %a0) { - ;CHECK-LABEL: stack_fold_vpmovusdb - ;CHECK: vpmovusdb %zmm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill +; CHECK-LABEL: stack_fold_vpmovusdb: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovusdb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1) %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ret <16 x i8> %1 @@ -2774,8 +5014,14 @@ define <16 x i8> @stack_fold_vpmovusdb(<16 x i32> %a0) { declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16) define <16 x i16> @stack_fold_vpmovusdw(<16 x i32> %a0) { - ;CHECK-LABEL: stack_fold_vpmovusdw - ;CHECK: vpmovusdw %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill +; CHECK-LABEL: stack_fold_vpmovusdw: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovusdw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: retq %1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1) %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ret <16 x i16> %1 @@ -2783,8 +5029,14 @@ define <16 x i16> @stack_fold_vpmovusdw(<16 x i32> %a0) { declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16) define <8 x i32> @stack_fold_vpmovusqd(<8 x i64> %a0) { - ;CHECK-LABEL: stack_fold_vpmovusqd - ;CHECK: vpmovusqd %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill +; CHECK-LABEL: stack_fold_vpmovusqd: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovusqd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: retq %1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %a0, <8 x i32> undef, i8 -1) %2 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ret <8 x i32> %1 @@ -2792,8 +5044,15 @@ define <8 x i32> @stack_fold_vpmovusqd(<8 x i64> %a0) { declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8) define <8 x i16> @stack_fold_vpmovusqw(<8 x i64> %a0) { - ;CHECK-LABEL: stack_fold_vpmovusqw - ;CHECK: vpmovusqw %zmm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill +; CHECK-LABEL: stack_fold_vpmovusqw: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovusqw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1) %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ret <8 x i16> %1 @@ -2801,8 +5060,14 @@ define <8 x i16> @stack_fold_vpmovusqw(<8 x i64> %a0) { declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8) define <32 x i8> @stack_fold_vpmovuswb(<32 x i16> %a0) { - ;CHECK-LABEL: stack_fold_vpmovuswb - ;CHECK: vpmovuswb %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill +; CHECK-LABEL: stack_fold_vpmovuswb: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovuswb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: retq %1 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %a0, <32 x i8> undef, i32 -1) %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ret <32 x i8> %1 @@ -2810,16 +5075,30 @@ define <32 x i8> @stack_fold_vpmovuswb(<32 x i16> %a0) { declare <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16>, <32 x i8>, i32) define <16 x i32> @stack_fold_pmovzxbd_zmm(<16 x i8> %a0) { - ;CHECK-LABEL: stack_fold_pmovzxbd_zmm - ;CHECK: vpmovzxbd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload +; CHECK-LABEL: stack_fold_pmovzxbd_zmm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload +; CHECK-NEXT: # zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = zext <16 x i8> %a0 to <16 x i32> ret <16 x i32> %2 } define <8 x i64> @stack_fold_pmovzxbq_zmm(<16 x i8> %a0) { - ;CHECK-LABEL: stack_fold_pmovzxbq_zmm - ;CHECK: vpmovzxbq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload +; CHECK-LABEL: stack_fold_pmovzxbq_zmm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload +; CHECK-NEXT: # zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %3 = zext <8 x i8> %2 to <8 x i64> @@ -2827,40 +5106,76 @@ define <8 x i64> @stack_fold_pmovzxbq_zmm(<16 x i8> %a0) { } define <32 x i16> @stack_fold_pmovzxbw_zmm(<32 x i8> %a0) { - ;CHECK-LABEL: stack_fold_pmovzxbw_zmm - ;CHECK: vpmovzxbw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload +; CHECK-LABEL: stack_fold_pmovzxbw_zmm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload +; CHECK-NEXT: # zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = zext <32 x i8> %a0 to <32 x i16> ret <32 x i16> %2 } define <8 x i64> @stack_fold_pmovzxdq_zmm(<8 x i32> %a0) { - ;CHECK-LABEL: stack_fold_pmovzxdq_zmm - ;CHECK: vpmovzxdq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload +; CHECK-LABEL: stack_fold_pmovzxdq_zmm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload +; CHECK-NEXT: # zmm0 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = zext <8 x i32> %a0 to <8 x i64> ret <8 x i64> %2 } define <16 x i32> @stack_fold_pmovzxwd_zmm(<16 x i16> %a0) { - ;CHECK-LABEL: stack_fold_pmovzxwd_zmm - ;CHECK: vpmovzxwd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload +; CHECK-LABEL: stack_fold_pmovzxwd_zmm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload +; CHECK-NEXT: # zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = zext <16 x i16> %a0 to <16 x i32> ret <16 x i32> %2 } define <8 x i64> @stack_fold_pmovzxwq_zmm(<8 x i16> %a0) { - ;CHECK-LABEL: stack_fold_pmovzxwq_zmm - ;CHECK: vpmovzxwq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload +; CHECK-LABEL: stack_fold_pmovzxwq_zmm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload +; CHECK-NEXT: # zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = zext <8 x i16> %a0 to <8 x i64> ret <8 x i64> %2 } define <8 x i64> @stack_fold_pmovzxwq_mask_zmm(<8 x i64> %passthru, <8 x i16> %a0, i8 %mask) { - ;CHECK-LABEL: stack_fold_pmovzxwq_mask_zmm - ;CHECK: vpmovzxwq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload +; CHECK-LABEL: stack_fold_pmovzxwq_mask_zmm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 16-byte Folded Reload +; CHECK-NEXT: # zmm0 {%k1} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = zext <8 x i16> %a0 to <8 x i64> %3 = bitcast i8 %mask to <8 x i1> @@ -2869,8 +5184,16 @@ define <8 x i64> @stack_fold_pmovzxwq_mask_zmm(<8 x i64> %passthru, <8 x i16> %a } define <8 x i64> @stack_fold_pmovzxwq_maskz_zmm(<8 x i16> %a0, i8 %mask) { - ;CHECK-LABEL: stack_fold_pmovzxwq_maskz_zmm - ;CHECK: vpmovzxwq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload +; CHECK-LABEL: stack_fold_pmovzxwq_maskz_zmm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 16-byte Folded Reload +; CHECK-NEXT: # zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = zext <8 x i16> %a0 to <8 x i64> %3 = bitcast i8 %mask to <8 x i1> @@ -2879,24 +5202,45 @@ define <8 x i64> @stack_fold_pmovzxwq_maskz_zmm(<8 x i16> %a0, i8 %mask) { } define <16 x i32> @stack_fold_pmulld(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_pmulld - ;CHECK: vpmulld {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmulld: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = mul <16 x i32> %a0, %a1 ret <16 x i32> %2 } define <16 x i32> @stack_fold_pmulld_commuted(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_pmulld_commuted: - ;CHECK: vpmulld {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmulld_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = mul <16 x i32> %a1, %a0 ret <16 x i32> %2 } define <16 x i32> 
@stack_fold_pmulld_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { - ;CHECK-LABEL: stack_fold_pmulld_mask - ;CHECK: vpmulld {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmulld_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = mul <16 x i32> %a0, %a1 %3 = bitcast i16 %mask to <16 x i1> @@ -2907,8 +5251,17 @@ define <16 x i32> @stack_fold_pmulld_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x } define <16 x i32> @stack_fold_pmulld_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { - ;CHECK-LABEL: stack_fold_pmulld_mask_commuted: - ;CHECK: vpmulld {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmulld_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = mul <16 x i32> %a1, %a0 %3 = bitcast i16 %mask to <16 x i1> @@ -2919,8 +5272,15 @@ define <16 x i32> @stack_fold_pmulld_mask_commuted(<16 x i32> %a0, <16 x i32> %a } define <16 x i32> @stack_fold_pmulld_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { - ;CHECK-LABEL: stack_fold_pmulld_maskz - ;CHECK: vpmulld {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmulld_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = mul <16 x i32> %a0, %a1 %3 = bitcast i16 %mask to <16 x i1> @@ -2929,8 +5289,15 @@ define <16 x i32> @stack_fold_pmulld_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 % } define <16 x i32> @stack_fold_pmulld_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { - 
;CHECK-LABEL: stack_fold_pmulld_maskz_commuted: - ;CHECK: vpmulld {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmulld_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = mul <16 x i32> %a1, %a0 %3 = bitcast i16 %mask to <16 x i1> @@ -2939,24 +5306,45 @@ define <16 x i32> @stack_fold_pmulld_maskz_commuted(<16 x i32> %a0, <16 x i32> % } define <8 x i64> @stack_fold_pmullq(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_pmullq - ;CHECK: vpmullq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmullq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = mul <8 x i64> %a0, %a1 ret <8 x i64> %2 } define <8 x i64> @stack_fold_pmullq_commuted(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_pmullq_commuted: - ;CHECK: vpmullq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmullq_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = mul <8 x i64> %a1, %a0 ret <8 x i64> %2 } define <8 x i64> @stack_fold_pmullq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { - ;CHECK-LABEL: stack_fold_pmullq_mask - ;CHECK: vpmullq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmullq_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = mul <8 x i64> %a0, %a1 %3 = bitcast i8 %mask to <8 x i1> @@ -2967,8 +5355,17 @@ define <8 x i64> @stack_fold_pmullq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> } define <8 x i64> @stack_fold_pmullq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { - ;CHECK-LABEL: stack_fold_pmullq_mask_commuted: - ;CHECK: vpmullq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmullq_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = mul <8 x i64> %a1, %a0 %3 = bitcast i8 %mask to <8 x i1> @@ -2979,8 +5376,15 @@ define <8 x i64> @stack_fold_pmullq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, } define <8 x i64> @stack_fold_pmullq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { - ;CHECK-LABEL: stack_fold_pmullq_maskz - ;CHECK: vpmullq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmullq_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = mul <8 x i64> %a0, %a1 %3 = bitcast i8 %mask to <8 x i1> @@ -2989,8 +5393,15 @@ define <8 x i64> @stack_fold_pmullq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask } define <8 x i64> @stack_fold_pmullq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { - ;CHECK-LABEL: stack_fold_pmullq_maskz_commuted: - ;CHECK: vpmullq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmullq_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = mul <8 x i64> %a1, %a0 %3 = bitcast i8 %mask to <8 x i1> @@ -2999,24 +5410,45 @@ define <8 x i64> @stack_fold_pmullq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, } define <32 x i16> @stack_fold_pmullw(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_pmullw - ;CHECK: vpmullw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmullw: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = mul <32 x i16> %a0, %a1 ret <32 x i16> %2 } define <32 x i16> @stack_fold_pmullw_commuted(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_pmullw_commuted: - ;CHECK: vpmullw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmullw_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = mul <32 x i16> %a1, %a0 ret <32 x i16> %2 } define <32 x i16> @stack_fold_pmullw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { - ;CHECK-LABEL: stack_fold_pmullw_mask - ;CHECK: vpmullw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmullw_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = mul <32 x i16> %a0, %a1 %3 = bitcast i32 %mask to <32 x i1> @@ -3027,8 +5459,17 @@ define <32 x i16> @stack_fold_pmullw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x } define <32 x i16> @stack_fold_pmullw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { - ;CHECK-LABEL: 
stack_fold_pmullw_mask_commuted: - ;CHECK: vpmullw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmullw_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = mul <32 x i16> %a1, %a0 %3 = bitcast i32 %mask to <32 x i1> @@ -3039,8 +5480,15 @@ define <32 x i16> @stack_fold_pmullw_mask_commuted(<32 x i16> %a0, <32 x i16> %a } define <32 x i16> @stack_fold_pmullw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { - ;CHECK-LABEL: stack_fold_pmullw_maskz - ;CHECK: vpmullw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmullw_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = mul <32 x i16> %a0, %a1 %3 = bitcast i32 %mask to <32 x i1> @@ -3049,8 +5497,15 @@ define <32 x i16> @stack_fold_pmullw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 % } define <32 x i16> @stack_fold_pmullw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { - ;CHECK-LABEL: stack_fold_pmullw_maskz_commuted: - ;CHECK: vpmullw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmullw_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = mul <32 x i16> %a1, %a0 %3 = bitcast i32 %mask to <32 x i1> @@ -3059,8 +5514,14 @@ define <32 x i16> @stack_fold_pmullw_maskz_commuted(<32 x i16> %a0, <32 x i16> % } define <8 x i64> @stack_fold_pmuldq(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_pmuldq - ;CHECK: vpmuldq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmuldq: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> @@ -3071,8 +5532,14 @@ define <8 x i64> @stack_fold_pmuldq(<8 x i64> %a0, <8 x i64> %a1) { } define <8 x i64> @stack_fold_pmuldq_commuted(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_pmuldq_commuted: - ;CHECK: vpmuldq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmuldq_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> @@ -3083,8 +5550,17 @@ define <8 x i64> @stack_fold_pmuldq_commuted(<8 x i64> %a0, <8 x i64> %a1) { } define <8 x i64> @stack_fold_pmuldq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { - ;CHECK-LABEL: stack_fold_pmuldq_mask - ;CHECK: vpmuldq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmuldq_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> @@ -3099,8 +5575,17 @@ define <8 x i64> @stack_fold_pmuldq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> } define <8 x i64> @stack_fold_pmuldq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { - ;CHECK-LABEL: stack_fold_pmuldq_mask_commuted: - ;CHECK: vpmuldq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: 
stack_fold_pmuldq_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> @@ -3115,8 +5600,15 @@ define <8 x i64> @stack_fold_pmuldq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, } define <8 x i64> @stack_fold_pmuldq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { - ;CHECK-LABEL: stack_fold_pmuldq_maskz - ;CHECK: vpmuldq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmuldq_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> @@ -3129,8 +5621,15 @@ define <8 x i64> @stack_fold_pmuldq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask } define <8 x i64> @stack_fold_pmuldq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { - ;CHECK-LABEL: stack_fold_pmuldq_maskz_commuted: - ;CHECK: vpmuldq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmuldq_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> @@ -3146,8 +5645,14 @@ define <8 x i64> @stack_fold_pmuldq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, define <8 x i64> @stack_fold_pmuludq(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_pmuludq - ;CHECK: 
vpmuludq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmuludq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> @@ -3156,8 +5661,14 @@ define <8 x i64> @stack_fold_pmuludq(<8 x i64> %a0, <8 x i64> %a1) { } define <8 x i64> @stack_fold_pmuludq_commuted(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_pmuludq_commuted: - ;CHECK: vpmuludq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmuludq_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> @@ -3166,8 +5677,17 @@ define <8 x i64> @stack_fold_pmuludq_commuted(<8 x i64> %a0, <8 x i64> %a1) { } define <8 x i64> @stack_fold_pmuludq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { - ;CHECK-LABEL: stack_fold_pmuludq_mask - ;CHECK: vpmuludq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmuludq_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> %3 = 
and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> @@ -3180,8 +5700,17 @@ define <8 x i64> @stack_fold_pmuludq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64 } define <8 x i64> @stack_fold_pmuludq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { - ;CHECK-LABEL: stack_fold_pmuludq_mask_commuted: - ;CHECK: vpmuludq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmuludq_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> @@ -3194,8 +5723,15 @@ define <8 x i64> @stack_fold_pmuludq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, } define <8 x i64> @stack_fold_pmuludq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { - ;CHECK-LABEL: stack_fold_pmuludq_maskz - ;CHECK: vpmuludq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmuludq_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> @@ -3206,8 +5742,15 @@ define <8 x i64> @stack_fold_pmuludq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mas } define <8 x i64> @stack_fold_pmuludq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { - ;CHECK-LABEL: stack_fold_pmuludq_maskz_commuted: - ;CHECK: vpmuludq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pmuludq_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: 
vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> @@ -3218,8 +5761,14 @@ define <8 x i64> @stack_fold_pmuludq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1 } define <16 x i32> @stack_fold_vpopcntd(<16 x i32> %a0) { - ;CHECK-LABEL: stack_fold_vpopcntd - ;CHECK: vpopcntd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_vpopcntd: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpopcntd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %a0) ret <16 x i32> %2 @@ -3227,8 +5776,14 @@ define <16 x i32> @stack_fold_vpopcntd(<16 x i32> %a0) { declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readonly define <8 x i64> @stack_fold_vpopcntq(<8 x i64> %a0) { - ;CHECK-LABEL: stack_fold_vpopcntq - ;CHECK: vpopcntq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_vpopcntq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpopcntq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %a0) ret <8 x i64> %2 @@ -3236,24 +5791,45 @@ define <8 x i64> @stack_fold_vpopcntq(<8 x i64> %a0) { declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone define <16 x i32> @stack_fold_pord(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_pord - ;CHECK: vorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pord: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = or <16 x i32> %a0, %a1 ret <16 x i32> %2 } define <16 x i32> @stack_fold_pord_commuted(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_pord_commuted: - ;CHECK: vorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pord_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = or <16 x i32> %a1, %a0 ret <16 x i32> %2 } define <16 x i32> @stack_fold_pord_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { - ;CHECK-LABEL: stack_fold_pord_mask - ;CHECK: vorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pord_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovaps %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovaps (%rdi), %zmm0 +; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = or <16 x i32> %a0, %a1 %3 = bitcast i16 %mask to <16 x i1> @@ -3264,8 +5840,17 @@ define <16 x i32> @stack_fold_pord_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i3 } define <16 x i32> @stack_fold_pord_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { - ;CHECK-LABEL: stack_fold_pord_mask_commuted: - ;CHECK: vorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pord_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovaps %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovaps (%rdi), %zmm0 +; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = or <16 x i32> %a1, %a0 %3 = bitcast i16 %mask to <16 x i1> @@ -3276,8 +5861,15 @@ define <16 x i32> @stack_fold_pord_mask_commuted(<16 x i32> %a0, 
<16 x i32> %a1, } define <16 x i32> @stack_fold_pord_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { - ;CHECK-LABEL: stack_fold_pord_maskz - ;CHECK: vorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pord_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = or <16 x i32> %a0, %a1 %3 = bitcast i16 %mask to <16 x i1> @@ -3286,8 +5878,15 @@ define <16 x i32> @stack_fold_pord_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %ma } define <16 x i32> @stack_fold_pord_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { - ;CHECK-LABEL: stack_fold_pord_maskz_commuted: - ;CHECK: vorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pord_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = or <16 x i32> %a1, %a0 %3 = bitcast i16 %mask to <16 x i1> @@ -3296,24 +5895,45 @@ define <16 x i32> @stack_fold_pord_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1 } define <8 x i64> @stack_fold_porq(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_porq - ;CHECK: vorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_porq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = or <8 x i64> %a0, %a1 ret <8 x i64> %2 } define <8 x i64> @stack_fold_porq_commuted(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_porq_commuted: - ;CHECK: vorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_porq_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 
# 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = or <8 x i64> %a1, %a0 ret <8 x i64> %2 } define <8 x i64> @stack_fold_porq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { - ;CHECK-LABEL: stack_fold_porq_mask - ;CHECK: vorpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_porq_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovapd %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovapd (%rdi), %zmm0 +; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = or <8 x i64> %a0, %a1 %3 = bitcast i8 %mask to <8 x i1> @@ -3324,8 +5944,17 @@ define <8 x i64> @stack_fold_porq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* } define <8 x i64> @stack_fold_porq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { - ;CHECK-LABEL: stack_fold_porq_mask_commuted: - ;CHECK: vorpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_porq_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovapd %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovapd (%rdi), %zmm0 +; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = or <8 x i64> %a1, %a0 %3 = bitcast i8 %mask to <8 x i1> @@ -3336,8 +5965,15 @@ define <8 x i64> @stack_fold_porq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 } define <8 x i64> @stack_fold_porq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { - ;CHECK-LABEL: stack_fold_porq_maskz - ;CHECK: vorpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_porq_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = or <8 x i64> %a0, %a1 %3 = bitcast i8 %mask to <8 x i1> @@ -3346,8 +5982,15 @@ define <8 x i64> @stack_fold_porq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) } define <8 x i64> @stack_fold_porq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { - ;CHECK-LABEL: stack_fold_porq_maskz_commuted: - ;CHECK: vorpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_porq_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = or <8 x i64> %a1, %a0 %3 = bitcast i8 %mask to <8 x i1> @@ -3356,8 +5999,14 @@ define <8 x i64> @stack_fold_porq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i } define <8 x i64> @stack_fold_psadbw(<64 x i8> %a0, <64 x i8> %a1) { - ;CHECK-LABEL: stack_fold_psadbw - ;CHECK: vpsadbw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psadbw: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %a0, <64 x i8> %a1) ret <8 x i64> %2 @@ -3365,8 +6014,14 @@ define <8 x i64> @stack_fold_psadbw(<64 x i8> %a0, <64 x i8> %a1) { declare <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>) nounwind readnone define <64 x i8> @stack_fold_pshufb_zmm(<64 x i8> %a0, <64 x i8> %a1) { - ;CHECK-LABEL: stack_fold_pshufb_zmm - ;CHECK: vpshufb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pshufb_zmm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x 
i8> %a0, <64 x i8> %a1) ret <64 x i8> %2 @@ -3374,8 +6029,17 @@ define <64 x i8> @stack_fold_pshufb_zmm(<64 x i8> %a0, <64 x i8> %a1) { declare <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>) define <64 x i8> @stack_fold_pshufb_zmm_mask(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) { - ;CHECK-LABEL: stack_fold_pshufb_zmm_mask - ;CHECK: vpshufb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pshufb_zmm_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT: kmovq %rsi, %k1 +; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = load <64 x i8>, <64 x i8>* %passthru %3 = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1) @@ -3385,8 +6049,15 @@ define <64 x i8> @stack_fold_pshufb_zmm_mask(<64 x i8>* %passthru, <64 x i8> %a0 } define <64 x i8> @stack_fold_pshufb_zmm_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { - ;CHECK-LABEL: stack_fold_pshufb_zmm_maskz - ;CHECK: vpshufb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pshufb_zmm_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1) %3 = bitcast i64 %mask to <64 x i1> @@ -3395,8 +6066,17 @@ define <64 x i8> @stack_fold_pshufb_zmm_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 } define <16 x i32> @stack_fold_pshufd_zmm(<16 x i32> %a0) { - ;CHECK-LABEL: stack_fold_pshufd_zmm - ;CHECK: vpshufd $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pshufd_zmm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> %3 = add <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> @@ -3404,8 +6084,22 @@ define <16 x i32> @stack_fold_pshufd_zmm(<16 x i32> %a0) { } define <16 x i32> @stack_fold_pshufd_zmm_mask(<16 x i32> %passthru, <16 x i32> %a0, i16 %mask) { - ;CHECK-LABEL: stack_fold_pshufd_zmm_mask - ;CHECK: vpshufd $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pshufd_zmm_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> %3 = bitcast i16 %mask to <16 x i1> @@ -3414,8 +6108,16 @@ define <16 x i32> @stack_fold_pshufd_zmm_mask(<16 x i32> %passthru, <16 x i32> % } define <16 x i32> @stack_fold_pshufd_zmm_maskz(<16 x i32> %a0, i16 %mask) { - ;CHECK-LABEL: stack_fold_pshufd_zmm_maskz - ;CHECK: vpshufd $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pshufd_zmm_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> %3 = bitcast i16 %mask to <16 x i1> @@ -3424,16 +6126,37 @@ define <16 x i32> 
@stack_fold_pshufd_zmm_maskz(<16 x i32> %a0, i16 %mask) { } define <32 x i16> @stack_fold_pshufhw_zmm(<32 x i16> %a0) { - ;CHECK-LABEL: stack_fold_pshufhw_zmm - ;CHECK: vpshufhw $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pshufhw_zmm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12,16,17,18,19,23,22,21,20,24,25,26,27,31,30,29,28] +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28> ret <32 x i16> %2 } define <32 x i16> @stack_fold_pshufhw_zmm_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) { - ;CHECK-LABEL: stack_fold_pshufhw_zmm_mask - ;CHECK: vpshufhw $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pshufhw_zmm_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 {%k1} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12,16,17,18,19,23,22,21,20,24,25,26,27,31,30,29,28] +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28> %3 = bitcast i32 %mask to <32 x i1> @@ -3442,8 +6165,16 @@ define <32 x i16> @stack_fold_pshufhw_zmm_mask(<32 x i16> %passthru, <32 x i16> } define <32 x i16> @stack_fold_pshufhw_zmm_maskz(<32 x i16> %a0, i32 %mask) { - ;CHECK-LABEL: stack_fold_pshufhw_zmm_maskz - ;CHECK: vpshufhw $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pshufhw_zmm_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: 
kmovd %edi, %k1 +; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 {%k1} {z} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12,16,17,18,19,23,22,21,20,24,25,26,27,31,30,29,28] +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28> %3 = bitcast i32 %mask to <32 x i1> @@ -3452,16 +6183,37 @@ define <32 x i16> @stack_fold_pshufhw_zmm_maskz(<32 x i16> %a0, i32 %mask) { } define <32 x i16> @stack_fold_pshuflw_zmm(<32 x i16> %a0) { - ;CHECK-LABEL: stack_fold_pshuflw_zmm - ;CHECK: vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pshuflw_zmm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15,19,18,17,16,20,21,22,23,27,26,25,24,28,29,30,31] +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31> ret <32 x i16> %2 } define <32 x i16> @stack_fold_pshuflw_zmm_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) { - ;CHECK-LABEL: stack_fold_pshuflw_zmm_mask - ;CHECK: vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pshuflw_zmm_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 {%k1} = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15,19,18,17,16,20,21,22,23,27,26,25,24,28,29,30,31] +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31> %3 = bitcast i32 %mask to <32 x i1> @@ -3470,8 +6222,16 @@ define <32 x i16> @stack_fold_pshuflw_zmm_mask(<32 x i16> %passthru, <32 x i16> } define <32 x i16> @stack_fold_pshuflw_zmm_maskz(<32 x i16> %a0, i32 %mask) { - ;CHECK-LABEL: stack_fold_pshuflw_zmm_maskz - ;CHECK: vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pshuflw_zmm_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 {%k1} {z} = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15,19,18,17,16,20,21,22,23,27,26,25,24,28,29,30,31] +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31> %3 = bitcast i32 %mask to <32 x i1> @@ -3480,8 +6240,14 @@ define <32 x i16> @stack_fold_pshuflw_zmm_maskz(<32 x i16> %a0, i32 %mask) { } define <16 x i32> @stack_fold_pslld(<16 x i32> %a0, <4 x i32> %a1) { - ;CHECK-LABEL: stack_fold_pslld - ;CHECK: vpslld {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload +; CHECK-LABEL: stack_fold_pslld: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ret <16 x i32> %2 @@ -3489,8 +6255,17 @@ define <16 x i32> @stack_fold_pslld(<16 x i32> %a0, <4 x i32> %a1) { declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>) nounwind readnone define <16 x i32> @stack_fold_pslld_mask(<16 x i32>* %passthru, <16 x i32> %a0, <4 x i32> %a1, i16 %mask) { - ;CHECK-LABEL: stack_fold_pslld_mask - ;CHECK: vpslld {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} 
{{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload +; CHECK-LABEL: stack_fold_pslld_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) %3 = bitcast i16 %mask to <16 x i1> @@ -3500,8 +6275,15 @@ define <16 x i32> @stack_fold_pslld_mask(<16 x i32>* %passthru, <16 x i32> %a0, } define <16 x i32> @stack_fold_pslld_maskz(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) { - ;CHECK-LABEL: stack_fold_pslld_maskz - ;CHECK: vpslld {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload +; CHECK-LABEL: stack_fold_pslld_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 16-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) %3 = bitcast i16 %mask to <16 x i1> @@ -3510,8 +6292,14 @@ define <16 x i32> @stack_fold_pslld_maskz(<16 x i32> %a0, <4 x i32> %a1, i16 %ma } define <16 x i32> @stack_fold_pslldi(<16 x i32> %a0) { - ;CHECK-LABEL: stack_fold_pslldi - ;CHECK: vpslld $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pslldi: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpslld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1) ret <16 x i32> %2 @@ -3519,8 +6307,17 @@ define <16 x i32> @stack_fold_pslldi(<16 x i32> %a0) { declare <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32>, i32) nounwind readnone define <16 x i32> @stack_fold_pslldi_mask(<16 x i32>* %passthru, <16 x i32> %a0, i16 %mask) { - ;CHECK-LABEL: stack_fold_pslldi_mask - ;CHECK: vpslld $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pslldi_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 +; CHECK-NEXT: vpslld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1) %3 = bitcast i16 %mask to <16 x i1> @@ -3530,8 +6327,15 @@ define <16 x i32> @stack_fold_pslldi_mask(<16 x i32>* %passthru, <16 x i32> %a0, } define <16 x i32> @stack_fold_pslldi_maskz(<16 x i32> %a0, i16 %mask) { - ;CHECK-LABEL: stack_fold_pslldi_maskz - ;CHECK: vpslld $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pslldi_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpslld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1) %3 = bitcast i16 %mask to <16 x i1> @@ -3540,16 +6344,29 @@ define <16 x i32> @stack_fold_pslldi_maskz(<16 x i32> %a0, i16 %mask) { } define <64 x i8> @stack_fold_pslldq(<64 x i8> %a, <64 x i8> %b) { - ;CHECK-LABEL: stack_fold_pslldq - ;CHECK: vpslldq $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pslldq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpslldq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 = zero,mem[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,mem[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zero,mem[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zero,mem[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62] +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 79, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 95, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 111, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 127, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 
61, i32 62> ret <64 x i8> %2 } define <8 x i64> @stack_fold_psllq(<8 x i64> %a0, <2 x i64> %a1) { - ;CHECK-LABEL: stack_fold_psllq - ;CHECK: vpsllq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload +; CHECK-LABEL: stack_fold_psllq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ret <8 x i64> %2 @@ -3557,8 +6374,14 @@ define <8 x i64> @stack_fold_psllq(<8 x i64> %a0, <2 x i64> %a1) { declare <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64>, <2 x i64>) nounwind readnone define <8 x i64> @stack_fold_psllqi(<8 x i64> %a0) { - ;CHECK-LABEL: stack_fold_psllqi - ;CHECK: vpsllq $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psllqi: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsllq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 1) ret <8 x i64> %2 @@ -3566,8 +6389,14 @@ define <8 x i64> @stack_fold_psllqi(<8 x i64> %a0) { declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) nounwind readnone define <16 x i32> @stack_fold_psllvd(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_psllvd - ;CHECK: vpsllvd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psllvd: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1) ret <16 x i32> %2 @@ -3575,8 +6404,17 @@ define <16 x i32> @stack_fold_psllvd(<16 x i32> %a0, <16 x i32> %a1) { declare <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32>, <16 x i32>) nounwind readnone define <16 x i32> @stack_fold_psllvd_mask(<16 x i32>* %passthru, <16 x i32> %a0, <16 x i32> %a1, i16 %mask) { - ;CHECK-LABEL: stack_fold_psllvd_mask - ;CHECK: vpsllvd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte 
Folded Reload +; CHECK-LABEL: stack_fold_psllvd_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1) %3 = bitcast i16 %mask to <16 x i1> @@ -3586,8 +6424,15 @@ define <16 x i32> @stack_fold_psllvd_mask(<16 x i32>* %passthru, <16 x i32> %a0, } define <16 x i32> @stack_fold_psllvd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { - ;CHECK-LABEL: stack_fold_psllvd_maskz - ;CHECK: vpsllvd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psllvd_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1) %3 = bitcast i16 %mask to <16 x i1> @@ -3596,8 +6441,14 @@ define <16 x i32> @stack_fold_psllvd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 % } define <8 x i64> @stack_fold_psllvq(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_psllvq - ;CHECK: vpsllvq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psllvq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1) ret <8 x i64> %2 @@ -3605,8 +6456,14 @@ define <8 x i64> @stack_fold_psllvq(<8 x i64> %a0, <8 x i64> %a1) { declare <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64>, <8 x i64>) nounwind readnone define <32 x i16> @stack_fold_psllvw(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_psllvw - ;CHECK: vpsllvw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psllvw: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsllvw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.psllv.w.512(<32 x i16> %a0, <32 x i16> %a1) ret <32 x i16> %2 @@ -3614,8 +6471,14 @@ define <32 x i16> @stack_fold_psllvw(<32 x i16> %a0, <32 x i16> %a1) { declare <32 x i16> @llvm.x86.avx512.psllv.w.512(<32 x i16>, <32 x i16>) nounwind readnone define <32 x i16> @stack_fold_psllw(<32 x i16> %a0, <8 x i16> %a1) { - ;CHECK-LABEL: stack_fold_psllw - ;CHECK: vpsllw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload +; CHECK-LABEL: stack_fold_psllw: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %a0, <8 x i16> %a1) ret <32 x i16> %2 @@ -3623,8 +6486,14 @@ define <32 x i16> @stack_fold_psllw(<32 x i16> %a0, <8 x i16> %a1) { declare <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16>, <8 x i16>) nounwind readnone define <32 x i16> @stack_fold_psllwi(<32 x i16> %a0) { - ;CHECK-LABEL: stack_fold_psllwi - ;CHECK: vpsllw $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psllwi: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsllw $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> %a0, i32 1) ret <32 x i16> %2 @@ -3632,8 +6501,14 @@ define <32 x i16> @stack_fold_psllwi(<32 x i16> %a0) { declare <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16>, i32) nounwind readnone define <16 x i32> @stack_fold_psrad(<16 x i32> %a0, <4 x i32> %a1) { - ;CHECK-LABEL: stack_fold_psrad - ;CHECK: vpsrad {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload +; CHECK-LABEL: stack_fold_psrad: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ret <16 x i32> %2 @@ -3641,8 +6516,14 @@ define <16 x i32> @stack_fold_psrad(<16 x i32> %a0, <4 x i32> %a1) { declare <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32>, <4 x i32>) nounwind readnone define <16 x i32> @stack_fold_psradi(<16 x i32> %a0) { - ;CHECK-LABEL: stack_fold_psradi - ;CHECK: vpsrad $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psradi: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsrad $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 1) ret <16 x i32> %2 @@ -3650,8 +6531,14 @@ define <16 x i32> @stack_fold_psradi(<16 x i32> %a0) { declare <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32>, i32) nounwind readnone define <8 x i64> @stack_fold_psraq(<8 x i64> %a0, <2 x i64> %a1) { - ;CHECK-LABEL: stack_fold_psraq - ;CHECK: vpsraq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload +; CHECK-LABEL: stack_fold_psraq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsraq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ret <8 x i64> %2 @@ -3659,8 +6546,14 @@ define <8 x i64> @stack_fold_psraq(<8 x i64> %a0, <2 x i64> %a1) { declare <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64>, <2 x i64>) nounwind readnone define <8 x i64> @stack_fold_psraqi(<8 x i64> %a0) { - ;CHECK-LABEL: stack_fold_psraqi - ;CHECK: vpsraq $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psraqi: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsraq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <8 x i64> 
@llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 1) ret <8 x i64> %2 @@ -3668,8 +6561,14 @@ define <8 x i64> @stack_fold_psraqi(<8 x i64> %a0) { declare <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64>, i32) nounwind readnone define <16 x i32> @stack_fold_psravd(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_psravd - ;CHECK: vpsravd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psravd: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1) ret <16 x i32> %2 @@ -3677,8 +6576,14 @@ define <16 x i32> @stack_fold_psravd(<16 x i32> %a0, <16 x i32> %a1) { declare <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32>, <16 x i32>) nounwind readnone define <8 x i64> @stack_fold_psravq(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_psravq - ;CHECK: vpsravq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psravq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsravq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1) ret <8 x i64> %2 @@ -3686,8 +6591,14 @@ define <8 x i64> @stack_fold_psravq(<8 x i64> %a0, <8 x i64> %a1) { declare <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64>, <8 x i64>) nounwind readnone define <32 x i16> @stack_fold_psravw(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_psravw - ;CHECK: vpsravw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psravw: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsravw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16> %a0, <32 x i16> %a1) ret <32 x i16> %2 @@ -3695,8 +6606,14 @@ define <32 x i16> @stack_fold_psravw(<32 x i16> %a0, <32 x i16> %a1) { declare <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16>, <32 x i16>) nounwind 
readnone define <32 x i16> @stack_fold_psraw(<32 x i16> %a0, <8 x i16> %a1) { - ;CHECK-LABEL: stack_fold_psraw - ;CHECK: vpsraw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload +; CHECK-LABEL: stack_fold_psraw: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %a0, <8 x i16> %a1) ret <32 x i16> %2 @@ -3704,8 +6621,14 @@ define <32 x i16> @stack_fold_psraw(<32 x i16> %a0, <8 x i16> %a1) { declare <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16>, <8 x i16>) nounwind readnone define <32 x i16> @stack_fold_psrawi(<32 x i16> %a0) { - ;CHECK-LABEL: stack_fold_psrawi - ;CHECK: vpsraw $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psrawi: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsraw $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %a0, i32 1) ret <32 x i16> %2 @@ -3713,8 +6636,14 @@ define <32 x i16> @stack_fold_psrawi(<32 x i16> %a0) { declare <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16>, i32) nounwind readnone define <16 x i32> @stack_fold_psrld(<16 x i32> %a0, <4 x i32> %a1) { - ;CHECK-LABEL: stack_fold_psrld - ;CHECK: vpsrld {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload +; CHECK-LABEL: stack_fold_psrld: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ret <16 x i32> %2 @@ -3722,8 +6651,14 @@ define <16 x i32> @stack_fold_psrld(<16 x i32> %a0, <4 x i32> %a1) { declare <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32>, <4 x i32>) nounwind readnone define <16 x i32> @stack_fold_psrldi(<16 x i32> %a0) { - ;CHECK-LABEL: stack_fold_psrldi - ;CHECK: vpsrld $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psrldi: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsrld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 1) ret <16 x i32> %2 @@ -3731,16 +6666,29 @@ define <16 x i32> @stack_fold_psrldi(<16 x i32> %a0) { declare <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32>, i32) nounwind readnone define <64 x i8> @stack_fold_psrldq(<64 x i8> %a, <64 x i8> %b) { - ;CHECK-LABEL: stack_fold_psrldq - ;CHECK: vpsrldq $2, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psrldq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsrldq $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 = mem[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,mem[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,mem[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,mem[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zero,zero +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 64, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 64, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 64, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 64> ret <64 x i8> %2 } define <8 x i64> @stack_fold_psrlq(<8 x i64> %a0, <2 x i64> %a1) { - ;CHECK-LABEL: stack_fold_psrlq - ;CHECK: vpsrlq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload +; CHECK-LABEL: stack_fold_psrlq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ret <8 x i64> %2 @@ -3748,8 +6696,14 @@ define <8 x i64> @stack_fold_psrlq(<8 x i64> %a0, <2 x i64> %a1) { declare <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64>, <2 x i64>) nounwind readnone define <8 x 
i64> @stack_fold_psrlqi(<8 x i64> %a0) { - ;CHECK-LABEL: stack_fold_psrlqi - ;CHECK: vpsrlq $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psrlqi: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsrlq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 1) ret <8 x i64> %2 @@ -3757,8 +6711,14 @@ define <8 x i64> @stack_fold_psrlqi(<8 x i64> %a0) { declare <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64>, i32) nounwind readnone define <16 x i32> @stack_fold_psrlvd(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_psrlvd - ;CHECK: vpsrlvd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psrlvd: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1) ret <16 x i32> %2 @@ -3766,8 +6726,14 @@ define <16 x i32> @stack_fold_psrlvd(<16 x i32> %a0, <16 x i32> %a1) { declare <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32>, <16 x i32>) nounwind readnone define <8 x i64> @stack_fold_psrlvq(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_psrlvq - ;CHECK: vpsrlvq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psrlvq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1) ret <8 x i64> %2 @@ -3775,8 +6741,14 @@ define <8 x i64> @stack_fold_psrlvq(<8 x i64> %a0, <8 x i64> %a1) { declare <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64>, <8 x i64>) nounwind readnone define <32 x i16> @stack_fold_psrlvw(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_psrlvw - ;CHECK: vpsrlvw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psrlvw: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsrlvw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16> %a0, <32 x i16> %a1) ret <32 x i16> %2 @@ -3784,8 +6756,14 @@ define <32 x i16> @stack_fold_psrlvw(<32 x i16> %a0, <32 x i16> %a1) { declare <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16>, <32 x i16>) nounwind readnone define <32 x i16> @stack_fold_psrlw(<32 x i16> %a0, <8 x i16> %a1) { - ;CHECK-LABEL: stack_fold_psrlw - ;CHECK: vpsrlw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload +; CHECK-LABEL: stack_fold_psrlw: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %a0, <8 x i16> %a1) ret <32 x i16> %2 @@ -3793,8 +6771,14 @@ define <32 x i16> @stack_fold_psrlw(<32 x i16> %a0, <8 x i16> %a1) { declare <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16>, <8 x i16>) nounwind readnone define <32 x i16> @stack_fold_psrlwi(<32 x i16> %a0) { - ;CHECK-LABEL: stack_fold_psrlwi - ;CHECK: vpsrlw $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psrlwi: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsrlw $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> %a0, i32 1) ret <32 x i16> %2 @@ -3802,80 +6786,157 @@ define <32 x i16> @stack_fold_psrlwi(<32 x i16> %a0) { declare <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16>, i32) nounwind readnone define <64 x i8> @stack_fold_psubb(<64 x i8> %a0, <64 x i8> %a1) { - ;CHECK-LABEL: stack_fold_psubb - ;CHECK: vpsubb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psubb: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = sub <64 x i8> %a0, %a1 ret <64 x i8> %2 } define <16 x i32> @stack_fold_psubd(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_psubd - ;CHECK: vpsubd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psubd: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsubd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = sub <16 x i32> %a0, %a1 ret <16 x i32> %2 } define <8 x i64> @stack_fold_psubq(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_psubq - ;CHECK: vpsubq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psubq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = sub <8 x i64> %a0, %a1 ret <8 x i64> %2 } define <64 x i8> @stack_fold_psubsb(<64 x i8> %a0, <64 x i8> %a1) { - ;CHECK-LABEL: stack_fold_psubsb - ;CHECK: vpsubsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psubsb: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) ret <64 x i8> %2 } define <32 x i16> @stack_fold_psubsw(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_psubsw - ;CHECK: vpsubsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psubsw: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) ret <32 x i16> %2 } define <64 x i8> @stack_fold_psubusb(<64 x i8> %a0, <64 x i8> %a1) { - ;CHECK-LABEL: stack_fold_psubusb - ;CHECK: vpsubusb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psubusb: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsubusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) ret <64 x i8> %2 } define <32 x i16> @stack_fold_psubusw(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_psubusw - ;CHECK: vpsubusw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psubusw: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) ret <32 x i16> %2 } define <32 x i16> @stack_fold_psubw(<32 x i16> %a0, <32 x i16> %a1) { - ;CHECK-LABEL: stack_fold_psubw - ;CHECK: vpsubw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_psubw: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpsubw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = sub <32 x i16> %a0, %a1 ret <32 x i16> %2 } define <8 x i64> @stack_fold_shufi64x2(<8 x i64> %a, <8 x i64> %b) { - ;CHECK-LABEL: stack_fold_shufi64x2 - ;CHECK: vshufi64x2 $24, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_shufi64x2: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 = zmm0[0,1,4,5],mem[2,3,0,1] +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9> ret <8 x i64> %2 } define <8 x i64> @stack_fold_shufi64x2_mask(<8 x i64> %a, <8 x i64> %b, i8 %mask, <8 x i64>* %passthru) { - ;CHECK-LABEL: stack_fold_shufi64x2_mask - ;CHECK: vshufi64x2 $24, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_shufi64x2_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm1 +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: # zmm1 {%k1} = zmm0[0,1,4,5],mem[2,3,0,1] +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9> %3 = bitcast i8 %mask to <8 x i1> @@ -3886,8 +6947,22 @@ define <8 x i64> @stack_fold_shufi64x2_mask(<8 x i64> %a, <8 x i64> %b, i8 %mask } define <8 x i64> @stack_fold_shufi64x2_maskz(<8 x i64> %a, <8 x i64> %b, i8 %mask, <8 x i64>* %passthru) { - ;CHECK-LABEL: stack_fold_shufi64x2_maskz - ;CHECK: vshufi64x2 $24, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_shufi64x2_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 {%k1} {z} = zmm0[0,1,4,5],mem[2,3,0,1] +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9> %3 = bitcast i8 %mask to <8 x i1> @@ -3896,8 +6971,24 @@ define <8 x i64> @stack_fold_shufi64x2_maskz(<8 x i64> %a, <8 x i64> %b, i8 %mas } define <16 x i32> @stack_fold_shufi32x4_mask(<16 x i32> %a, <16 x i32> %b, i16 %mask, <16 x i32>* %passthru) { - ;CHECK-LABEL: stack_fold_shufi32x4_mask - ;CHECK: vshufi32x4 $20, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_shufi32x4_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm1 +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vshufi32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: # zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3] +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19> %3 = bitcast i16 %mask to <16 x i1> @@ -3908,8 +6999,22 @@ define <16 x i32> @stack_fold_shufi32x4_mask(<16 x i32> %a, <16 x i32> %b, i16 % } define <16 x i32> @stack_fold_shufi32x4_maskz(<16 x i32> %a, <16 x i32> %b, i16 %mask) { - ;CHECK-LABEL: stack_fold_shufi32x4_maskz - ;CHECK: vshufi32x4 $20, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_shufi32x4_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vshufi32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3] +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19> %3 = bitcast i16 %mask to <16 x i1> @@ -3918,8 +7023,14 @@ define <16 x i32> @stack_fold_shufi32x4_maskz(<16 x i32> %a, <16 x i32> %b, i16 } define <16 x i32> @stack_fold_ternlogd(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { - ;CHECK-LABEL: stack_fold_ternlogd - ;CHECK: vpternlogd $33, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_ternlogd: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpternlogd $33, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33) ret <16 x i32> %2 @@ -3927,8 +7038,14 @@ define <16 x i32> @stack_fold_ternlogd(<16 x i32> %x0, <16 x i32> %x1, <16 x i32 declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32) define <8 x i64> @stack_fold_ternlogq(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { - ;CHECK-LABEL: stack_fold_ternlogq - ;CHECK: vpternlogq $33, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_ternlogq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpternlogq $33, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33) ret <8 x i64> %2 @@ -3937,16 +7054,33 @@ define <8 x i64> @stack_fold_ternlogq(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32) define <64 x i8> @stack_fold_punpckhbw_zmm(<64 x i8> %a0, <64 x i8> %a1) { - ;CHECK-LABEL: stack_fold_punpckhbw_zmm - ;CHECK: vpunpckhbw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_punpckhbw_zmm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 = 
zmm0[8],mem[8],zmm0[9],mem[9],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[12],mem[12],zmm0[13],mem[13],zmm0[14],mem[14],zmm0[15],mem[15],zmm0[24],mem[24],zmm0[25],mem[25],zmm0[26],mem[26],zmm0[27],mem[27],zmm0[28],mem[28],zmm0[29],mem[29],zmm0[30],mem[30],zmm0[31],mem[31],zmm0[40],mem[40],zmm0[41],mem[41],zmm0[42],mem[42],zmm0[43],mem[43],zmm0[44],mem[44],zmm0[45],mem[45],zmm0[46],mem[46],zmm0[47],mem[47],zmm0[56],mem[56],zmm0[57],mem[57],zmm0[58],mem[58],zmm0[59],mem[59],zmm0[60],mem[60],zmm0[61],mem[61],zmm0[62],mem[62],zmm0[63],mem[63] +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127> ret <64 x i8> %2 } define <64 x i8> @stack_fold_punpckhbw_mask_zmm(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) { - ;CHECK-LABEL: stack_fold_punpckhbw_mask_zmm - ;CHECK: vpunpckhbw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_punpckhbw_mask_zmm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rsi, %k1 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: # zmm2 {%k1} = zmm0[8],mem[8],zmm0[9],mem[9],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[12],mem[12],zmm0[13],mem[13],zmm0[14],mem[14],zmm0[15],mem[15],zmm0[24],mem[24],zmm0[25],mem[25],zmm0[26],mem[26],zmm0[27],mem[27],zmm0[28],mem[28],zmm0[29],mem[29],zmm0[30],mem[30],zmm0[31],mem[31],zmm0[40],mem[40],zmm0[41],mem[41],zmm0[42],mem[42],zmm0[43],mem[43],zmm0[44],mem[44],zmm0[45],mem[45],zmm0[46],mem[46],zmm0[47],mem[47],zmm0[56],mem[56],zmm0[57],mem[57],zmm0[58],mem[58],zmm0[59],mem[59],zmm0[60],mem[60],zmm0[61],mem[61],zmm0[62],mem[62],zmm0[63],mem[63] +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, 
i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127> %3 = bitcast i64 %mask to <64 x i1> @@ -3957,8 +7091,16 @@ define <64 x i8> @stack_fold_punpckhbw_mask_zmm(<64 x i8>* %passthru, <64 x i8> } define <64 x i8> @stack_fold_punpckhbw_maskz_zmm(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { - ;CHECK-LABEL: stack_fold_punpckhbw_maskz_zmm - ;CHECK: vpunpckhbw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_punpckhbw_maskz_zmm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 {%k1} {z} = zmm0[8],mem[8],zmm0[9],mem[9],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[12],mem[12],zmm0[13],mem[13],zmm0[14],mem[14],zmm0[15],mem[15],zmm0[24],mem[24],zmm0[25],mem[25],zmm0[26],mem[26],zmm0[27],mem[27],zmm0[28],mem[28],zmm0[29],mem[29],zmm0[30],mem[30],zmm0[31],mem[31],zmm0[40],mem[40],zmm0[41],mem[41],zmm0[42],mem[42],zmm0[43],mem[43],zmm0[44],mem[44],zmm0[45],mem[45],zmm0[46],mem[46],zmm0[47],mem[47],zmm0[56],mem[56],zmm0[57],mem[57],zmm0[58],mem[58],zmm0[59],mem[59],zmm0[60],mem[60],zmm0[61],mem[61],zmm0[62],mem[62],zmm0[63],mem[63] +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127> %3 = bitcast i64 %mask to <64 x i1> @@ -3967,24 +7109,45 @@ define <64 x i8> @stack_fold_punpckhbw_maskz_zmm(<64 x i8> %a0, <64 x i8> %a1, i } define <16 x i32> @stack_fold_pxord(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_pxord - ;CHECK: vxorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pxord: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = xor <16 x i32> %a0, %a1 ret <16 x i32> %2 } define <16 x i32> @stack_fold_pxord_commuted(<16 x i32> %a0, <16 x i32> %a1) { - ;CHECK-LABEL: stack_fold_pxord_commuted: - ;CHECK: vxorps {{-?[0-9]*}}(%rsp), 
{{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pxord_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = xor <16 x i32> %a1, %a0 ret <16 x i32> %2 } define <16 x i32> @stack_fold_pxord_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { - ;CHECK-LABEL: stack_fold_pxord_mask - ;CHECK: vxorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pxord_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovaps %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovaps (%rdi), %zmm0 +; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = xor <16 x i32> %a0, %a1 %3 = bitcast i16 %mask to <16 x i1> @@ -3995,8 +7158,17 @@ define <16 x i32> @stack_fold_pxord_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i } define <16 x i32> @stack_fold_pxord_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { - ;CHECK-LABEL: stack_fold_pxord_mask_commuted: - ;CHECK: vxorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pxord_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovaps %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovaps (%rdi), %zmm0 +; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = xor <16 x i32> %a1, %a0 %3 = bitcast i16 %mask to <16 x i1> @@ -4007,8 +7179,15 @@ define <16 x i32> @stack_fold_pxord_mask_commuted(<16 x i32> %a0, <16 x i32> %a1 } define <16 x i32> @stack_fold_pxord_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { - ;CHECK-LABEL: stack_fold_pxord_maskz - ;CHECK: vxorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pxord_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; 
CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = xor <16 x i32> %a0, %a1 %3 = bitcast i16 %mask to <16 x i1> @@ -4017,8 +7196,15 @@ define <16 x i32> @stack_fold_pxord_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %m } define <16 x i32> @stack_fold_pxord_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { - ;CHECK-LABEL: stack_fold_pxord_maskz_commuted: - ;CHECK: vxorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pxord_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = xor <16 x i32> %a1, %a0 %3 = bitcast i16 %mask to <16 x i1> @@ -4027,24 +7213,45 @@ define <16 x i32> @stack_fold_pxord_maskz_commuted(<16 x i32> %a0, <16 x i32> %a } define <8 x i64> @stack_fold_pxorq(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_pxorq - ;CHECK: vxorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pxorq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = xor <8 x i64> %a0, %a1 ret <8 x i64> %2 } define <8 x i64> @stack_fold_pxorq_commuted(<8 x i64> %a0, <8 x i64> %a1) { - ;CHECK-LABEL: stack_fold_pxorq_commuted: - ;CHECK: vxorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pxorq_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = xor <8 x i64> %a1, %a0 ret <8 x i64> %2 } define <8 x i64> 
@stack_fold_pxorq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { - ;CHECK-LABEL: stack_fold_pxorq_mask - ;CHECK: vxorpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pxorq_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovapd %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovapd (%rdi), %zmm0 +; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = xor <8 x i64> %a0, %a1 %3 = bitcast i8 %mask to <8 x i1> @@ -4055,8 +7262,17 @@ define <8 x i64> @stack_fold_pxorq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* } define <8 x i64> @stack_fold_pxorq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { - ;CHECK-LABEL: stack_fold_pxorq_mask_commuted: - ;CHECK: vxorpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pxorq_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovapd %zmm0, %zmm1 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vmovapd (%rdi), %zmm0 +; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = xor <8 x i64> %a1, %a0 %3 = bitcast i8 %mask to <8 x i1> @@ -4067,8 +7283,15 @@ define <8 x i64> @stack_fold_pxorq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, < } define <8 x i64> @stack_fold_pxorq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { - ;CHECK-LABEL: stack_fold_pxorq_maskz - ;CHECK: vxorpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pxorq_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = xor <8 x i64> %a0, %a1 %3 = bitcast i8 %mask to <8 x i1> @@ -4077,8 +7300,15 @@ define <8 x i64> @stack_fold_pxorq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) } define <8 x i64> @stack_fold_pxorq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { - ;CHECK-LABEL: stack_fold_pxorq_maskz_commuted: - 
;CHECK: vxorpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload +; CHECK-LABEL: stack_fold_pxorq_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = xor <8 x i64> %a1, %a0 %3 = bitcast i8 %mask to <8 x i1> |