| field | value | date |
|---|---|---|
| author | Craig Topper <craig.topper@gmail.com> | 2016-12-03 19:37:39 +0000 |
| committer | Craig Topper <craig.topper@gmail.com> | 2016-12-03 19:37:39 +0000 |
| commit | 9d16bfa0f51bc5066623aaa5438f8d7615a582ee (patch) | |
| tree | 6645af1d1bda86fd273bfbd69051ffae41a9ec89 /llvm/test/CodeGen | |
| parent | b55f620ebcdf24d0cd322219285f6a9c47156660 (diff) | |
| download | bcm5719-llvm-9d16bfa0f51bc5066623aaa5438f8d7615a582ee.tar.gz, bcm5719-llvm-9d16bfa0f51bc5066623aaa5438f8d7615a582ee.zip | |
[AVX-512] Add many of the VPERM instructions to the load folding table. Move VPERMPDZri to the correct table.
llvm-svn: 288591
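
The folding-table change itself is in llvm/lib/Target/X86/X86InstrInfo.cpp, which falls outside this diffstat (limited to llvm/test/CodeGen), so it is not shown here. As a rough sketch of what such entries look like: the X86 backend keeps static tables mapping a register-form opcode to its load-folding form, and the target's foldMemoryOperand hook consults them when the register allocator wants to fold a reload into a use. The struct layout, table names, and opcode pairs below are illustrative assumptions in the style of that era's X86InstrInfo.cpp, not a reproduction of the commit's actual entries.

```cpp
#include <cstdint>

// Stand-ins so this sketch is self-contained; in-tree, the opcode values come
// from the TableGen-generated X86 instruction enum and the entry struct lives
// in X86InstrInfo.cpp. All names below are assumptions for illustration.
namespace X86 {
enum : uint16_t {
  VPERMDZrr,   // vpermd  zmm, zmm, zmm   (register form)
  VPERMDZrm,   // vpermd  zmm, zmm, m512  (load-folding form)
  VPERMQZrr,   // vpermq  zmm, zmm, zmm   (variable form)
  VPERMQZrm,   // vpermq  zmm, zmm, m512
  VPERMPDZri,  // vpermpd zmm, zmm, imm8
  VPERMPDZmi,  // vpermpd zmm, m512, imm8
};
} // namespace X86

struct X86MemoryFoldTableEntry {
  uint16_t RegOp;  // opcode that takes all operands in registers
  uint16_t MemOp;  // equivalent opcode with the foldable operand in memory
  uint16_t Flags;  // alignment and special-case flags; 0 means none
};

// Ops with two register sources: folding replaces the second source with a
// load, e.g. a reload from a spill slot on %rsp.
static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
    {X86::VPERMDZrr, X86::VPERMDZrm, 0},
    {X86::VPERMQZrr, X86::VPERMQZrm, 0},
};

// Ops with one register source plus an immediate: the lone register source is
// the foldable operand, so an entry like VPERMPDZri belongs in the
// one-operand table. Moving it there is the second half of the commit title.
static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
    {X86::VPERMPDZri, X86::VPERMPDZmi, 0},
};
```

In the tests below, the inline-asm "nop" clobbers nearly every vector register, forcing the input to be spilled across it; the "Folded Reload" CHECK patterns then verify that the reload was folded into the vperm instruction itself rather than done with a separate move.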
Diffstat (limited to 'llvm/test/CodeGen')
| mode | path | changes |
|---|---|---|
| -rw-r--r-- | llvm/test/CodeGen/X86/stack-folding-int-avx512.ll | 185 |
| -rw-r--r-- | llvm/test/CodeGen/X86/stack-folding-int-avx512vl.ll | 82 |
2 files changed, 266 insertions(+), 1 deletion(-)
```diff
diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll
index efce19d8619..5203f8d3b8a 100644
--- a/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll
@@ -870,7 +870,190 @@ define <16 x i32> @stack_fold_pmaddwd_zmm_maskz(<16 x i32>* %passthru, <32 x i16
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1, <16 x i32> undef, i16 -1)
   %3 = bitcast i16 %mask to <16 x i1>
-  ; load needed to keep the operation from being scheduled about the asm block
   %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
   ret <16 x i32> %4
 }
+
+define <16 x i32> @stack_fold_permd(<16 x i32> %a0, <16 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_permd
+  ;CHECK: vpermd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a1, <16 x i32> %a0, <16 x i32> undef, i16 -1)
+  ret <16 x i32> %2
+}
+declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readonly
+
+define <8 x double> @stack_fold_permpd(<8 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_permpd
+  ;CHECK: vpermpd $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
+  ; fadd forces execution domain
+  %3 = fadd <8 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
+  ret <8 x double> %3
+}
+
+define <8 x double> @stack_fold_permpd_mask(<8 x double>* %passthru, <8 x double> %a0, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_permpd_mask
+  ;CHECK: vpermpd $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
+  %3 = bitcast i8 %mask to <8 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %4 = load <8 x double>, <8 x double>* %passthru
+  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
+  ; fadd forces execution domain
+  %6 = fadd <8 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
+  ret <8 x double> %6
+}
+
+define <8 x double> @stack_fold_permpd_maskz(<8 x double> %a0, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_permpd_maskz
+  ;CHECK: vpermpd $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
+  ret <8 x double> %4
+}
+
+define <8 x double> @stack_fold_permpdvar(<8 x i64> %a0, <8 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_permpdvar
+  ;CHECK: vpermpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %a1, <8 x i64> %a0, <8 x double> undef, i8 -1)
+  ; fadd forces execution domain
+  %3 = fadd <8 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
+  ret <8 x double> %3
+}
+declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8) nounwind readonly
+
+define <16 x float> @stack_fold_permps(<16 x i32> %a0, <16 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_permps
+  ;CHECK: vpermps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %a1, <16 x i32> %a0, <16 x float> undef, i16 -1)
+  ret <16 x float> %2
+}
+declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i32>, <16 x float>, i16) nounwind readonly
+
+define <8 x i64> @stack_fold_permq(<8 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_permq
+  ;CHECK: vpermq $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
+  ; add forces execution domain
+  %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  ret <8 x i64> %3
+}
+
+define <8 x i64> @stack_fold_permq_mask(<8 x i64>* %passthru, <8 x i64> %a0, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_permq_mask
+  ;CHECK: vpermq $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
+  %3 = bitcast i8 %mask to <8 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %4 = load <8 x i64>, <8 x i64>* %passthru
+  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
+  ; add forces execution domain
+  %6 = add <8 x i64> %5, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  ret <8 x i64> %6
+}
+
+define <8 x i64> @stack_fold_permq_maskz(<8 x i64>* %passthru, <8 x i64> %a0, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_permq_maskz
+  ;CHECK: vpermq $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
+  ret <8 x i64> %4
+}
+
+define <8 x i64> @stack_fold_permqvar(<8 x i64> %a0, <8 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_permqvar
+  ;CHECK: vpermq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0, <8 x i64> undef, i8 -1)
+  ; add forces execution domain
+  %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  ret <8 x i64> %3
+}
+declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readonly
+
+define <8 x i64> @stack_fold_permqvar_mask(<8 x i64>* %passthru, <8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_permqvar_mask
+  ;CHECK: vpermq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0, <8 x i64> undef, i8 -1)
+  %3 = bitcast i8 %mask to <8 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %4 = load <8 x i64>, <8 x i64>* %passthru
+  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
+  ; add forces execution domain
+  %6 = add <8 x i64> %5, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  ret <8 x i64> %6
+}
+
+define <32 x i16> @stack_fold_permwvar(<32 x i16> %a0, <32 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_permwvar
+  ;CHECK: vpermw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0, <32 x i16> undef, i32 -1)
+  ret <32 x i16> %2
+}
+declare <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) nounwind readonly
+
+define <32 x i16> @stack_fold_permwvar_mask(<32 x i16>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
+  ;CHECK-LABEL: stack_fold_permwvar_mask
+  ;CHECK: vpermw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0, <32 x i16> undef, i32 -1)
+  %3 = bitcast i32 %mask to <32 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %4 = load <32 x i16>, <32 x i16>* %passthru
+  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
+  ret <32 x i16> %5
+}
+
+define <32 x i16> @stack_fold_permwvar_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
+  ;CHECK-LABEL: stack_fold_permwvar_maskz
+  ;CHECK: vpermw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0, <32 x i16> undef, i32 -1)
+  %3 = bitcast i32 %mask to <32 x i1>
+  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
+  ret <32 x i16> %4
+}
+
+define <64 x i8> @stack_fold_permbvar(<64 x i8> %a0, <64 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_permbvar
+  ;CHECK: vpermb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0, <64 x i8> undef, i64 -1)
+  ret <64 x i8> %2
+}
+declare <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) nounwind readonly
+
+define <64 x i8> @stack_fold_permbvar_mask(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
+  ;CHECK-LABEL: stack_fold_permbvar_mask
+  ;CHECK: vpermb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0, <64 x i8> undef, i64 -1)
+  %3 = bitcast i64 %mask to <64 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %4 = load <64 x i8>, <64 x i8>* %passthru
+  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
+  ret <64 x i8> %5
+}
+
+define <64 x i8> @stack_fold_permbvar_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
+  ;CHECK-LABEL: stack_fold_permbvar_maskz
+  ;CHECK: vpermb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0, <64 x i8> undef, i64 -1)
+  %3 = bitcast i64 %mask to <64 x i1>
+  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
+  ret <64 x i8> %4
+}
diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx512vl.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx512vl.ll
index e1107870a7a..da269199c7f 100644
--- a/llvm/test/CodeGen/X86/stack-folding-int-avx512vl.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-int-avx512vl.ll
@@ -1340,3 +1340,85 @@ define <8 x i32> @stack_fold_pmaddwd_ymm_maskz(<16 x i16> %a0, <16 x i16> %a1, i
   %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
   ret <8 x i32> %4
 }
+
+define <8 x i32> @stack_fold_permd(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_permd
+  ;CHECK: vpermd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a1, <8 x i32> %a0)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
+
+define <4 x double> @stack_fold_permpd(<4 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_permpd
+  ;CHECK: vpermpd $235, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
+  ; fadd forces execution domain
+  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
+  ret <4 x double> %3
+}
+
+define <4 x double> @stack_fold_permpdvar(<4 x i64> %a0, <4 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_permpdvar
+  ;CHECK: vpermpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %a1, <4 x i64> %a0, <4 x double> undef, i8 -1)
+  ; fadd forces execution domain
+  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
+  ret <4 x double> %3
+}
+declare <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double>, <4 x i64>, <4 x double>, i8) nounwind readonly
+
+define <8 x float> @stack_fold_permps(<8 x i32> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_permps
+  ;CHECK: vpermps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a1, <8 x i32> %a0)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
+
+define <4 x i64> @stack_fold_permq(<4 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_permq
+  ;CHECK: vpermq $235, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
+  ; add forces execution domain
+  %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
+  ret <4 x i64> %3
+}
+
+define <4 x i64> @stack_fold_permqvar(<4 x i64> %a0, <4 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_permqvar
+  ;CHECK: vpermq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %a1, <4 x i64> %a0, <4 x i64> undef, i8 -1)
+  ; add forces execution domain
+  %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
+  ret <4 x i64> %3
+}
+declare <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) nounwind readonly
+
+define <16 x i16> @stack_fold_permwvar(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_permwvar
+  ;CHECK: vpermw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %a1, <16 x i16> %a0, <16 x i16> undef, i16 -1)
+  ; add forces execution domain
+  %3 = add <16 x i16> %2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <16 x i16> %3
+}
+declare <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) nounwind readonly
+
+define <32 x i8> @stack_fold_permbvar(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_permbvar
+  ;CHECK: vpermb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %a1, <32 x i8> %a0, <32 x i8> undef, i32 -1)
+  ; add forces execution domain
+  %3 = add <32 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <32 x i8> %3
+}
+declare <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) nounwind readonly
```

