author     Craig Topper <craig.topper@intel.com>    2019-09-28 01:56:36 +0000
committer  Craig Topper <craig.topper@intel.com>    2019-09-28 01:56:36 +0000
commit     8b5ad3d16ec6c95e4d685bbdebeec7b5558254af (patch)
tree       882b369779e9ad2829f701b7c8ba6da587be3088
parent     82a707e94175bf9569f4dd0c0adda094ac046662 (diff)
download   bcm5719-llvm-8b5ad3d16ec6c95e4d685bbdebeec7b5558254af.tar.gz
           bcm5719-llvm-8b5ad3d16ec6c95e4d685bbdebeec7b5558254af.zip
[X86] Add broadcast load unfolding support for VPTESTMD/Q and VPTESTNMD/Q.
llvm-svn: 373138
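
The practical effect exercised by the new tests below: once VPTESTM/VPTESTNM have
entries in the broadcast fold table, a compare whose broadcast operand is loop
invariant can be unfolded back into the register form, so the broadcast itself
can be hoisted out of the loop. A rough sketch of the intended transformation,
with illustrative labels and registers rather than output copied from the tests:

    # Folded form: the splat constant is re-broadcast from memory on every iteration.
    .Lloop:
      vptestmd mask(%rip){1to4}, %xmm1, %k1   # broadcast-memory form (VPTESTMDZ128rmb)
      ...
      jne .Lloop

    # After unfolding, the broadcast becomes a separate loop-invariant instruction
    # that can be hoisted above the loop:
      vpbroadcastd mask(%rip), %xmm0
    .Lloop:
      vptestmd %xmm0, %xmm1, %k1              # register form (VPTESTMDZ128rr)
      ...
      jne .Lloop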
-rw-r--r--   llvm/lib/Target/X86/X86InstrFoldTables.cpp         |  12
-rw-r--r--   llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll   | 150
2 files changed, 162 insertions, 0 deletions
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
index f02760ff63a..f3b286e0375 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -5374,6 +5374,18 @@ static const X86MemoryFoldTableEntry BroadcastFoldTable2[] = {
{ X86::VPORQZ128rr, X86::VPORQZ128rmb, TB_BCAST_Q },
{ X86::VPORQZ256rr, X86::VPORQZ256rmb, TB_BCAST_Q },
{ X86::VPORQZrr, X86::VPORQZrmb, TB_BCAST_Q },
+ { X86::VPTESTMDZ128rr, X86::VPTESTMDZ128rmb, TB_BCAST_D },
+ { X86::VPTESTMDZ256rr, X86::VPTESTMDZ256rmb, TB_BCAST_D },
+ { X86::VPTESTMDZrr, X86::VPTESTMDZrmb, TB_BCAST_D },
+ { X86::VPTESTMQZ128rr, X86::VPTESTMQZ128rmb, TB_BCAST_Q },
+ { X86::VPTESTMQZ256rr, X86::VPTESTMQZ256rmb, TB_BCAST_Q },
+ { X86::VPTESTMQZrr, X86::VPTESTMQZrmb, TB_BCAST_Q },
+ { X86::VPTESTNMDZ128rr,X86::VPTESTNMDZ128rmb,TB_BCAST_D },
+ { X86::VPTESTNMDZ256rr,X86::VPTESTNMDZ256rmb,TB_BCAST_D },
+ { X86::VPTESTNMDZrr, X86::VPTESTNMDZrmb, TB_BCAST_D },
+ { X86::VPTESTNMQZ128rr,X86::VPTESTNMQZ128rmb,TB_BCAST_Q },
+ { X86::VPTESTNMQZ256rr,X86::VPTESTNMQZ256rmb,TB_BCAST_Q },
+ { X86::VPTESTNMQZrr, X86::VPTESTNMQZrmb, TB_BCAST_Q },
{ X86::VPXORDZ128rr, X86::VPXORDZ128rmb, TB_BCAST_D },
{ X86::VPXORDZ256rr, X86::VPXORDZ256rmb, TB_BCAST_D },
{ X86::VPXORDZrr, X86::VPXORDZrmb, TB_BCAST_D },
diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
index ae7db38aad1..aa07a4db8d5 100644
--- a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
+++ b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
@@ -4483,3 +4483,153 @@ define void @bcast_unfold_cmp_v8f32_refold(float* nocapture %0) {
12: ; preds = %2
ret void
}
+
+define void @bcast_unfold_ptestm_v4i32(i32* %arg) {
+; CHECK-LABEL: bcast_unfold_ptestm_v4i32:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB127_1: # %bb1
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1
+; CHECK-NEXT: vptestmd %xmm0, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k1}
+; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
+; CHECK-NEXT: addq $16, %rax
+; CHECK-NEXT: jne .LBB127_1
+; CHECK-NEXT: # %bb.2: # %bb10
+; CHECK-NEXT: retq
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb1, %bb
+ %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
+ %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
+ %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
+ %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
+ %tmp4b = and <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
+ %tmp5 = icmp ne <4 x i32> %tmp4b, zeroinitializer
+ %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
+ %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
+ store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
+ %tmp8 = add i64 %tmp, 4
+ %tmp9 = icmp eq i64 %tmp8, 1024
+ br i1 %tmp9, label %bb10, label %bb1
+
+bb10: ; preds = %bb1
+ ret void
+}
+
+define void @bcast_unfold_ptestnm_v4i32(i32* %arg) {
+; CHECK-LABEL: bcast_unfold_ptestnm_v4i32:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB128_1: # %bb1
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1
+; CHECK-NEXT: vptestnmd %xmm0, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k1}
+; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
+; CHECK-NEXT: addq $16, %rax
+; CHECK-NEXT: jne .LBB128_1
+; CHECK-NEXT: # %bb.2: # %bb10
+; CHECK-NEXT: retq
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb1, %bb
+ %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
+ %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
+ %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
+ %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
+ %tmp4b = and <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
+ %tmp5 = icmp eq <4 x i32> %tmp4b, zeroinitializer
+ %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
+ %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
+ store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
+ %tmp8 = add i64 %tmp, 4
+ %tmp9 = icmp eq i64 %tmp8, 1024
+ br i1 %tmp9, label %bb10, label %bb1
+
+bb10: ; preds = %bb1
+ ret void
+}
+
+define void @bcast_unfold_ptestm_v4i64(i64* %arg) {
+; CHECK-LABEL: bcast_unfold_ptestm_v4i64:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB129_1: # %bb1
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1
+; CHECK-NEXT: vptestmq %ymm0, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 {%k1}
+; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
+; CHECK-NEXT: addq $32, %rax
+; CHECK-NEXT: jne .LBB129_1
+; CHECK-NEXT: # %bb.2: # %bb10
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb1, %bb
+ %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
+ %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
+ %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
+ %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
+ %tmp4b = and <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
+ %tmp5 = icmp ne <4 x i64> %tmp4b, zeroinitializer
+ %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
+ %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
+ store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
+ %tmp8 = add i64 %tmp, 4
+ %tmp9 = icmp eq i64 %tmp8, 1024
+ br i1 %tmp9, label %bb10, label %bb1
+
+bb10: ; preds = %bb1
+ ret void
+}
+
+define void @bcast_unfold_ptestnm_v4i64(i64* %arg) {
+; CHECK-LABEL: bcast_unfold_ptestnm_v4i64:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB130_1: # %bb1
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1
+; CHECK-NEXT: vptestnmq %ymm0, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 {%k1}
+; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
+; CHECK-NEXT: addq $32, %rax
+; CHECK-NEXT: jne .LBB130_1
+; CHECK-NEXT: # %bb.2: # %bb10
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb1, %bb
+ %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
+ %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
+ %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
+ %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
+ %tmp4b = and <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
+ %tmp5 = icmp eq <4 x i64> %tmp4b, zeroinitializer
+ %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
+ %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
+ store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
+ %tmp8 = add i64 %tmp, 4
+ %tmp9 = icmp eq i64 %tmp8, 1024
+ br i1 %tmp9, label %bb10, label %bb1
+
+bb10: ; preds = %bb1
+ ret void
+}