author     Craig Topper <craig.topper@intel.com>  2019-09-28 01:56:36 +0000
committer  Craig Topper <craig.topper@intel.com>  2019-09-28 01:56:36 +0000
commit     8b5ad3d16ec6c95e4d685bbdebeec7b5558254af
tree       882b369779e9ad2829f701b7c8ba6da587be3088
parent     82a707e94175bf9569f4dd0c0adda094ac046662
[X86] Add broadcast load unfolding support for VPTESTMD/Q and VPTESTNMD/Q.
llvm-svn: 373138
 llvm/lib/Target/X86/X86InstrFoldTables.cpp       |  12 +
 llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll | 150 +
 2 files changed, 162 insertions(+), 0 deletions(-)
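The diff below adds rows to the BroadcastFoldTable2 array, each pairing a register-register opcode with its broadcast-memory form and a flag giving the broadcast element width (TB_BCAST_D for dwords, TB_BCAST_Q for qwords). As a rough illustration of how entries of this shape can be consumed in the unfold direction, here is a minimal, self-contained C++ sketch; it is not LLVM's actual API, and the opcode enumerators and flag values are stand-ins for the X86::* names used in the diff.

```cpp
// Hypothetical sketch of a broadcast fold table and a reverse ("unfold")
// lookup. Opcode and flag values are illustrative only.
#include <cstdint>
#include <iostream>
#include <optional>

enum Opcode : unsigned {
  VPTESTMDZ128rr, VPTESTMDZ128rmb,   // register form / broadcast-memory form
  VPTESTNMQZrr, VPTESTNMQZrmb,
};

// Broadcast element width, mirroring TB_BCAST_D / TB_BCAST_Q in the diff.
enum BcastFlag : std::uint16_t { TB_BCAST_D, TB_BCAST_Q };

struct MemoryFoldTableEntry {
  unsigned RegOp;   // register-register opcode
  unsigned MemOp;   // broadcast-memory opcode
  BcastFlag Flags;  // element width of the broadcast operand
};

// A subset of the entries this commit adds, in sketch form.
static const MemoryFoldTableEntry BroadcastFoldTable2[] = {
    {VPTESTMDZ128rr, VPTESTMDZ128rmb, TB_BCAST_D},
    {VPTESTNMQZrr, VPTESTNMQZrmb, TB_BCAST_Q},
};

// Reverse lookup: given a broadcast-memory opcode, recover the register
// form plus the broadcast width, so the broadcast load can be emitted as
// a separate instruction.
std::optional<MemoryFoldTableEntry> lookupUnfold(unsigned MemOpcode) {
  for (const MemoryFoldTableEntry &E : BroadcastFoldTable2)
    if (E.MemOp == MemOpcode)
      return E;
  return std::nullopt;
}

int main() {
  if (auto E = lookupUnfold(VPTESTMDZ128rmb))
    std::cout << "unfold to register opcode " << E->RegOp
              << " with a dword broadcast load (flag " << E->Flags << ")\n";
}
```

The real tables use the same three-column layout; a linear scan here stands in for whatever lookup strategy LLVM actually uses.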
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
index f02760ff63a..f3b286e0375 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -5374,6 +5374,18 @@ static const X86MemoryFoldTableEntry BroadcastFoldTable2[] = {
   { X86::VPORQZ128rr,       X86::VPORQZ128rmb,       TB_BCAST_Q },
   { X86::VPORQZ256rr,       X86::VPORQZ256rmb,       TB_BCAST_Q },
   { X86::VPORQZrr,          X86::VPORQZrmb,          TB_BCAST_Q },
+  { X86::VPTESTMDZ128rr,    X86::VPTESTMDZ128rmb,    TB_BCAST_D },
+  { X86::VPTESTMDZ256rr,    X86::VPTESTMDZ256rmb,    TB_BCAST_D },
+  { X86::VPTESTMDZrr,       X86::VPTESTMDZrmb,       TB_BCAST_D },
+  { X86::VPTESTMQZ128rr,    X86::VPTESTMQZ128rmb,    TB_BCAST_Q },
+  { X86::VPTESTMQZ256rr,    X86::VPTESTMQZ256rmb,    TB_BCAST_Q },
+  { X86::VPTESTMQZrr,       X86::VPTESTMQZrmb,       TB_BCAST_Q },
+  { X86::VPTESTNMDZ128rr,   X86::VPTESTNMDZ128rmb,   TB_BCAST_D },
+  { X86::VPTESTNMDZ256rr,   X86::VPTESTNMDZ256rmb,   TB_BCAST_D },
+  { X86::VPTESTNMDZrr,      X86::VPTESTNMDZrmb,      TB_BCAST_D },
+  { X86::VPTESTNMQZ128rr,   X86::VPTESTNMQZ128rmb,   TB_BCAST_Q },
+  { X86::VPTESTNMQZ256rr,   X86::VPTESTNMQZ256rmb,   TB_BCAST_Q },
+  { X86::VPTESTNMQZrr,      X86::VPTESTNMQZrmb,      TB_BCAST_Q },
   { X86::VPXORDZ128rr,      X86::VPXORDZ128rmb,      TB_BCAST_D },
   { X86::VPXORDZ256rr,      X86::VPXORDZ256rmb,      TB_BCAST_D },
   { X86::VPXORDZrr,         X86::VPXORDZrmb,         TB_BCAST_D },
diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
index ae7db38aad1..aa07a4db8d5 100644
--- a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
+++ b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
@@ -4483,3 +4483,153 @@ define void @bcast_unfold_cmp_v8f32_refold(float* nocapture %0) {
 12:                                               ; preds = %2
   ret void
 }
+
+define void @bcast_unfold_ptestm_v4i32(i32* %arg) {
+; CHECK-LABEL: bcast_unfold_ptestm_v4i32:
+; CHECK:       # %bb.0: # %bb
+; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB127_1: # %bb1
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %xmm1
+; CHECK-NEXT:    vptestmd %xmm0, %xmm1, %k1
+; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1 {%k1}
+; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
+; CHECK-NEXT:    addq $16, %rax
+; CHECK-NEXT:    jne .LBB127_1
+; CHECK-NEXT:  # %bb.2: # %bb10
+; CHECK-NEXT:    retq
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb1, %bb
+  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
+  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
+  %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
+  %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
+  %tmp4b = and <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
+  %tmp5 = icmp ne <4 x i32> %tmp4b, zeroinitializer
+  %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
+  %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
+  store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
+  %tmp8 = add i64 %tmp, 4
+  %tmp9 = icmp eq i64 %tmp8, 1024
+  br i1 %tmp9, label %bb10, label %bb1
+
+bb10:                                             ; preds = %bb1
+  ret void
+}
+
+define void @bcast_unfold_ptestnm_v4i32(i32* %arg) {
+; CHECK-LABEL: bcast_unfold_ptestnm_v4i32:
+; CHECK:       # %bb.0: # %bb
+; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB128_1: # %bb1
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %xmm1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm1, %k1
+; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1 {%k1}
+; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
+; CHECK-NEXT:    addq $16, %rax
+; CHECK-NEXT:    jne .LBB128_1
+; CHECK-NEXT:  # %bb.2: # %bb10
+; CHECK-NEXT:    retq
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb1, %bb
+  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
+  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
+  %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
+  %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
+  %tmp4b = and <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
+  %tmp5 = icmp eq <4 x i32> %tmp4b, zeroinitializer
+  %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
+  %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
+  store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
+  %tmp8 = add i64 %tmp, 4
+  %tmp9 = icmp eq i64 %tmp8, 1024
+  br i1 %tmp9, label %bb10, label %bb1
+
+bb10:                                             ; preds = %bb1
+  ret void
+}
+
+define void @bcast_unfold_ptestm_v4i64(i64* %arg) {
+; CHECK-LABEL: bcast_unfold_ptestm_v4i64:
+; CHECK:       # %bb.0: # %bb
+; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB129_1: # %bb1
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %ymm1
+; CHECK-NEXT:    vptestmq %ymm0, %ymm1, %k1
+; CHECK-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
+; CHECK-NEXT:    addq $32, %rax
+; CHECK-NEXT:    jne .LBB129_1
+; CHECK-NEXT:  # %bb.2: # %bb10
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb1, %bb
+  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
+  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
+  %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
+  %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
+  %tmp4b = and <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
+  %tmp5 = icmp ne <4 x i64> %tmp4b, zeroinitializer
+  %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
+  %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
+  store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
+  %tmp8 = add i64 %tmp, 4
+  %tmp9 = icmp eq i64 %tmp8, 1024
+  br i1 %tmp9, label %bb10, label %bb1
+
+bb10:                                             ; preds = %bb1
+  ret void
+}
+
+define void @bcast_unfold_ptestnm_v4i64(i64* %arg) {
+; CHECK-LABEL: bcast_unfold_ptestnm_v4i64:
+; CHECK:       # %bb.0: # %bb
+; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB130_1: # %bb1
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %ymm1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm1, %k1
+; CHECK-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
+; CHECK-NEXT:    addq $32, %rax
+; CHECK-NEXT:    jne .LBB130_1
+; CHECK-NEXT:  # %bb.2: # %bb10
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb1, %bb
+  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
+  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
+  %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
+  %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
+  %tmp4b = and <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
+  %tmp5 = icmp eq <4 x i64> %tmp4b, zeroinitializer
+  %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
+  %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
+  store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
+  %tmp8 = add i64 %tmp, 4
+  %tmp9 = icmp eq i64 %tmp8, 1024
+  br i1 %tmp9, label %bb10, label %bb1
+
+bb10:                                             ; preds = %bb1
+  ret void
+}
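As the new CHECK lines show, the intent of these table entries is visible in the generated code: the broadcast of the loop-invariant splat constant (the [2,2,2,2] operand) is emitted as a vpbroadcastd/vpbroadcastq before the loop, and the loop body then uses the register forms vptestmd, vptestnmd, vptestmq, and vptestnmq instead of a broadcast-memory operand.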