summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- llvm/lib/Target/X86/X86InstrFoldTables.cpp 74
-rw-r--r-- llvm/lib/Target/X86/X86InstrFoldTables.h 13
-rw-r--r-- llvm/lib/Target/X86/X86InstrInfo.cpp 78
-rw-r--r-- llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll 233
4 files changed, 274 insertions, 124 deletions
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
index d42fec3770c..4f98fdf5ea4 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -5245,6 +5245,69 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VXORPSZrrk, X86::VXORPSZrmk, 0 },
};
+static const X86MemoryFoldTableEntry BroadcastFoldTable2[] = { // Each entry: { register-register opcode, its `rmb` opcode, TB_BCAST_* broadcast-type flag }.
+ { X86::VADDPDZ128rr, X86::VADDPDZ128rmb, TB_BCAST_SD },
+ { X86::VADDPDZ256rr, X86::VADDPDZ256rmb, TB_BCAST_SD },
+ { X86::VADDPDZrr, X86::VADDPDZrmb, TB_BCAST_SD },
+ { X86::VADDPSZ128rr, X86::VADDPSZ128rmb, TB_BCAST_SS },
+ { X86::VADDPSZ256rr, X86::VADDPSZ256rmb, TB_BCAST_SS },
+ { X86::VADDPSZrr, X86::VADDPSZrmb, TB_BCAST_SS },
+ { X86::VDIVPDZ128rr, X86::VDIVPDZ128rmb, TB_BCAST_SD },
+ { X86::VDIVPDZ256rr, X86::VDIVPDZ256rmb, TB_BCAST_SD },
+ { X86::VDIVPDZrr, X86::VDIVPDZrmb, TB_BCAST_SD },
+ { X86::VDIVPSZ128rr, X86::VDIVPSZ128rmb, TB_BCAST_SS },
+ { X86::VDIVPSZ256rr, X86::VDIVPSZ256rmb, TB_BCAST_SS },
+ { X86::VDIVPSZrr, X86::VDIVPSZrmb, TB_BCAST_SS },
+ { X86::VMULPDZ128rr, X86::VMULPDZ128rmb, TB_BCAST_SD },
+ { X86::VMULPDZ256rr, X86::VMULPDZ256rmb, TB_BCAST_SD },
+ { X86::VMULPDZrr, X86::VMULPDZrmb, TB_BCAST_SD },
+ { X86::VMULPSZ128rr, X86::VMULPSZ128rmb, TB_BCAST_SS },
+ { X86::VMULPSZ256rr, X86::VMULPSZ256rmb, TB_BCAST_SS },
+ { X86::VMULPSZrr, X86::VMULPSZrmb, TB_BCAST_SS },
+ { X86::VPADDDZ128rr, X86::VPADDDZ128rmb, TB_BCAST_D },
+ { X86::VPADDDZ256rr, X86::VPADDDZ256rmb, TB_BCAST_D },
+ { X86::VPADDDZrr, X86::VPADDDZrmb, TB_BCAST_D },
+ { X86::VPADDQZ128rr, X86::VPADDQZ128rmb, TB_BCAST_Q },
+ { X86::VPADDQZ256rr, X86::VPADDQZ256rmb, TB_BCAST_Q },
+ { X86::VPADDQZrr, X86::VPADDQZrmb, TB_BCAST_Q },
+ { X86::VPANDDZ128rr, X86::VPANDDZ128rmb, TB_BCAST_D },
+ { X86::VPANDDZ256rr, X86::VPANDDZ256rmb, TB_BCAST_D },
+ { X86::VPANDDZrr, X86::VPANDDZrmb, TB_BCAST_D },
+ { X86::VPANDNDZ128rr, X86::VPANDNDZ128rmb, TB_BCAST_D },
+ { X86::VPANDNDZ256rr, X86::VPANDNDZ256rmb, TB_BCAST_D },
+ { X86::VPANDNDZrr, X86::VPANDNDZrmb, TB_BCAST_D },
+ { X86::VPANDNQZ128rr, X86::VPANDNQZ128rmb, TB_BCAST_Q },
+ { X86::VPANDNQZ256rr, X86::VPANDNQZ256rmb, TB_BCAST_Q },
+ { X86::VPANDNQZrr, X86::VPANDNQZrmb, TB_BCAST_Q },
+ { X86::VPANDQZ128rr, X86::VPANDQZ128rmb, TB_BCAST_Q },
+ { X86::VPANDQZ256rr, X86::VPANDQZ256rmb, TB_BCAST_Q },
+ { X86::VPANDQZrr, X86::VPANDQZrmb, TB_BCAST_Q },
+ { X86::VPMULLDZ128rr, X86::VPMULLDZ128rmb, TB_BCAST_D },
+ { X86::VPMULLDZ256rr, X86::VPMULLDZ256rmb, TB_BCAST_D },
+ { X86::VPMULLDZrr, X86::VPMULLDZrmb, TB_BCAST_D },
+ { X86::VPMULLQZ128rr, X86::VPMULLQZ128rmb, TB_BCAST_Q },
+ { X86::VPMULLQZ256rr, X86::VPMULLQZ256rmb, TB_BCAST_Q },
+ { X86::VPMULLQZrr, X86::VPMULLQZrmb, TB_BCAST_Q },
+ { X86::VPORDZ128rr, X86::VPORDZ128rmb, TB_BCAST_D },
+ { X86::VPORDZ256rr, X86::VPORDZ256rmb, TB_BCAST_D },
+ { X86::VPORDZrr, X86::VPORDZrmb, TB_BCAST_D },
+ { X86::VPORQZ128rr, X86::VPORQZ128rmb, TB_BCAST_Q },
+ { X86::VPORQZ256rr, X86::VPORQZ256rmb, TB_BCAST_Q },
+ { X86::VPORQZrr, X86::VPORQZrmb, TB_BCAST_Q },
+ { X86::VPXORDZ128rr, X86::VPXORDZ128rmb, TB_BCAST_D },
+ { X86::VPXORDZ256rr, X86::VPXORDZ256rmb, TB_BCAST_D },
+ { X86::VPXORDZrr, X86::VPXORDZrmb, TB_BCAST_D },
+ { X86::VPXORQZ128rr, X86::VPXORQZ128rmb, TB_BCAST_Q },
+ { X86::VPXORQZ256rr, X86::VPXORQZ256rmb, TB_BCAST_Q },
+ { X86::VPXORQZrr, X86::VPXORQZrmb, TB_BCAST_Q },
+ { X86::VSUBPDZ128rr, X86::VSUBPDZ128rmb, TB_BCAST_SD },
+ { X86::VSUBPDZ256rr, X86::VSUBPDZ256rmb, TB_BCAST_SD },
+ { X86::VSUBPDZrr, X86::VSUBPDZrmb, TB_BCAST_SD },
+ { X86::VSUBPSZ128rr, X86::VSUBPSZ128rmb, TB_BCAST_SS },
+ { X86::VSUBPSZ256rr, X86::VSUBPSZ256rmb, TB_BCAST_SS },
+ { X86::VSUBPSZrr, X86::VSUBPSZrmb, TB_BCAST_SS },
+}; // Keep entries sorted and unique: lookupFoldTableImpl asserts both properties when NDEBUG is not defined.
+
static const X86MemoryFoldTableEntry *
lookupFoldTableImpl(ArrayRef<X86MemoryFoldTableEntry> Table, unsigned RegOp) {
#ifndef NDEBUG
@@ -5287,6 +5350,12 @@ lookupFoldTableImpl(ArrayRef<X86MemoryFoldTableEntry> Table, unsigned RegOp) {
std::end(MemoryFoldTable4)) ==
std::end(MemoryFoldTable4) &&
"MemoryFoldTable4 is not sorted and unique!");
+ assert(std::is_sorted(std::begin(BroadcastFoldTable2),
+ std::end(BroadcastFoldTable2)) &&
+ std::adjacent_find(std::begin(BroadcastFoldTable2),
+ std::end(BroadcastFoldTable2)) ==
+ std::end(BroadcastFoldTable2) &&
+ "BroadcastFoldTable2 is not sorted and unique!");
FoldTablesChecked.store(true, std::memory_order_relaxed);
}
#endif
@@ -5355,6 +5424,11 @@ struct X86MemUnfoldTable {
// Index 4, folded load
addTableEntry(Entry, TB_INDEX_4 | TB_FOLDED_LOAD);
+ // Broadcast tables.
+ for (const X86MemoryFoldTableEntry &Entry : BroadcastFoldTable2)
+ // Index 2, folded broadcast
+ addTableEntry(Entry, TB_INDEX_2 | TB_FOLDED_LOAD | TB_FOLDED_BCAST);
+
// Sort the memory->reg unfold table.
array_pod_sort(Table.begin(), Table.end());
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.h b/llvm/lib/Target/X86/X86InstrFoldTables.h
index 4efbeb9f0be..7dc236a0d7e 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.h
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.h
@@ -38,7 +38,7 @@ enum {
TB_FOLDED_LOAD = 1 << 5,
TB_FOLDED_STORE = 1 << 6,
- // Unused bit 7
+ TB_FOLDED_BCAST = 1 << 7,
// Minimum alignment required for load/store.
// Used for RegOp->MemOp conversion. Encoded as Log2(Align) + 1 to allow 0
@@ -51,7 +51,16 @@ enum {
TB_ALIGN_64 = 7 << TB_ALIGN_SHIFT,
TB_ALIGN_MASK = 0xf << TB_ALIGN_SHIFT,
- // Unused bits 12-15
+ // Broadcast type.
+ // (stored in bits 12 - 13)
+ TB_BCAST_TYPE_SHIFT = 12,
+ TB_BCAST_D = 0 << TB_BCAST_TYPE_SHIFT,
+ TB_BCAST_Q = 1 << TB_BCAST_TYPE_SHIFT,
+ TB_BCAST_SS = 2 << TB_BCAST_TYPE_SHIFT,
+ TB_BCAST_SD = 3 << TB_BCAST_TYPE_SHIFT,
+ TB_BCAST_MASK = 0x3 << TB_BCAST_TYPE_SHIFT,
+
+ // Unused bits 14-15
};
// This struct is used for both the folding and unfold tables. They KeyOp
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 88c5ae2d416..d597ec90ea6 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -5311,6 +5311,51 @@ extractStoreMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) {
return StoreMMOs;
}
+static unsigned getBroadcastOpcode(const X86MemoryFoldTableEntry *I, // Returns the broadcast-from-memory opcode matching entry I's TB_BCAST_* type and RC's spill size.
+ const TargetRegisterClass *RC,
+ const X86Subtarget &STI) {
+ assert(STI.hasAVX512() && "Expected at least AVX512!");
+ unsigned SpillSize = STI.getRegisterInfo()->getSpillSize(*RC); // Size in bytes; only 16, 32 and 64 are handled below.
+ assert((SpillSize == 64 || STI.hasVLX()) &&
+ "Can't broadcast less than 64 bytes without AVX512VL!");
+
+ switch (I->Flags & TB_BCAST_MASK) { // The broadcast type is stored in the TB_BCAST_MASK bits of the entry's flags.
+ default: llvm_unreachable("Unexpected broadcast type!");
+ case TB_BCAST_D: // VPBROADCASTD variants, chosen by vector width.
+ switch (SpillSize) {
+ default: llvm_unreachable("Unknown spill size");
+ case 16: return X86::VPBROADCASTDZ128m;
+ case 32: return X86::VPBROADCASTDZ256m;
+ case 64: return X86::VPBROADCASTDZm;
+ }
+ break; // Not reached: every inner case either returns or is llvm_unreachable.
+ case TB_BCAST_Q: // VPBROADCASTQ variants, chosen by vector width.
+ switch (SpillSize) {
+ default: llvm_unreachable("Unknown spill size");
+ case 16: return X86::VPBROADCASTQZ128m;
+ case 32: return X86::VPBROADCASTQZ256m;
+ case 64: return X86::VPBROADCASTQZm;
+ }
+ break; // Not reached (see the inner switch above).
+ case TB_BCAST_SS: // VBROADCASTSS variants, chosen by vector width.
+ switch (SpillSize) {
+ default: llvm_unreachable("Unknown spill size");
+ case 16: return X86::VBROADCASTSSZ128m;
+ case 32: return X86::VBROADCASTSSZ256m;
+ case 64: return X86::VBROADCASTSSZm;
+ }
+ break; // Not reached (see the inner switch above).
+ case TB_BCAST_SD: // VBROADCASTSD variants for 32/64 bytes; the 16-byte case differs (see below).
+ switch (SpillSize) {
+ default: llvm_unreachable("Unknown spill size");
+ case 16: return X86::VMOVDDUPZ128rm; // NOTE(review): uses VMOVDDUP here, presumably because there is no 128-bit VBROADCASTSD form - confirm.
+ case 32: return X86::VBROADCASTSDZ256m;
+ case 64: return X86::VBROADCASTSDZm;
+ }
+ break; // Not reached (see the inner switch above).
+ }
+}
+
bool X86InstrInfo::unfoldMemoryOperand(
MachineFunction &MF, MachineInstr &MI, unsigned Reg, bool UnfoldLoad,
bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const {
@@ -5321,6 +5366,7 @@ bool X86InstrInfo::unfoldMemoryOperand(
unsigned Index = I->Flags & TB_INDEX_MASK;
bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
bool FoldedStore = I->Flags & TB_FOLDED_STORE;
+ bool FoldedBCast = I->Flags & TB_FOLDED_BCAST;
if (UnfoldLoad && !FoldedLoad)
return false;
UnfoldLoad &= FoldedLoad;
@@ -5329,6 +5375,7 @@ bool X86InstrInfo::unfoldMemoryOperand(
UnfoldStore &= FoldedStore;
const MCInstrDesc &MCID = get(Opc);
+
const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
// TODO: Check if 32-byte or greater accesses are slow too?
@@ -5354,12 +5401,19 @@ bool X86InstrInfo::unfoldMemoryOperand(
AfterOps.push_back(Op);
}
- // Emit the load instruction.
+ // Emit the load or broadcast instruction.
if (UnfoldLoad) {
auto MMOs = extractLoadMMOs(MI.memoperands(), MF);
- unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
- bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
- unsigned Opc = getLoadRegOpcode(Reg, RC, isAligned, Subtarget);
+
+ unsigned Opc;
+ if (FoldedBCast) {
+ Opc = getBroadcastOpcode(I, RC, Subtarget);
+ } else {
+ unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
+ bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
+ Opc = getLoadRegOpcode(Reg, RC, isAligned, Subtarget);
+ }
+
DebugLoc DL;
MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), Reg);
for (unsigned i = 0, e = AddrOps.size(); i != e; ++i)
@@ -5460,6 +5514,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
unsigned Index = I->Flags & TB_INDEX_MASK;
bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
bool FoldedStore = I->Flags & TB_FOLDED_STORE;
+ bool FoldedBCast = I->Flags & TB_FOLDED_BCAST;
const MCInstrDesc &MCID = get(Opc);
MachineFunction &MF = DAG.getMachineFunction();
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
@@ -5493,10 +5548,17 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
return false;
// FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
// memory access is slow above.
- unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
- bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
- Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl,
- VT, MVT::Other, AddrOps);
+
+ unsigned Opc;
+ if (FoldedBCast) {
+ Opc = getBroadcastOpcode(I, RC, Subtarget);
+ } else {
+ unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
+ bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
+ Opc = getLoadRegOpcode(0, RC, isAligned, Subtarget);
+ }
+
+ Load = DAG.getMachineNode(Opc, dl, VT, MVT::Other, AddrOps);
NewNodes.push_back(Load);
// Preserve memory reference information.
diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
index ed243ae800c..327941f7f81 100644
--- a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
+++ b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
@@ -8,12 +8,12 @@ define void @bcast_unfold_add_v16i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_add_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm0
-; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; CHECK-NEXT: vmovdqu64 %zmm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vpaddd 4096(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB0_1
; CHECK-NEXT: # %bb.2: # %bb10
@@ -42,12 +42,12 @@ define void @bcast_unfold_add_v8i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_add_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB1_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm0
-; CHECK-NEXT: vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-NEXT: vmovdqu %ymm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vpaddd 4096(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB1_1
; CHECK-NEXT: # %bb.2: # %bb10
@@ -76,12 +76,12 @@ define void @bcast_unfold_add_v4i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_add_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB2_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm0
-; CHECK-NEXT: vpaddd {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-NEXT: vmovdqu %xmm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vpaddd 4096(%rdi,%rax), %xmm0, %xmm1
+; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB2_1
; CHECK-NEXT: # %bb.2: # %bb10
@@ -109,12 +109,12 @@ define void @bcast_unfold_add_v8i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_add_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB3_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm0
-; CHECK-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; CHECK-NEXT: vmovdqu64 %zmm0, 8192(%rdi,%rax)
+; CHECK-NEXT: vpaddq 8192(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB3_1
; CHECK-NEXT: # %bb.2: # %bb10
@@ -143,12 +143,12 @@ define void @bcast_unfold_add_v4i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_add_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB4_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm0
-; CHECK-NEXT: vpaddq {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; CHECK-NEXT: vmovdqu %ymm0, 8192(%rdi,%rax)
+; CHECK-NEXT: vpaddq 8192(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB4_1
; CHECK-NEXT: # %bb.2: # %bb10
@@ -210,12 +210,12 @@ define void @bcast_unfold_mul_v16i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_mul_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB6_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm0
-; CHECK-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; CHECK-NEXT: vmovdqu64 %zmm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vpmulld 4096(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB6_1
; CHECK-NEXT: # %bb.2: # %bb10
@@ -244,12 +244,12 @@ define void @bcast_unfold_mul_v8i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_mul_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB7_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm0
-; CHECK-NEXT: vpmulld {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-NEXT: vmovdqu %ymm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vpmulld 4096(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB7_1
; CHECK-NEXT: # %bb.2: # %bb10
@@ -278,12 +278,12 @@ define void @bcast_unfold_mul_v4i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_mul_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,3,3,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB8_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm0
-; CHECK-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-NEXT: vmovdqu %xmm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vpmulld 4096(%rdi,%rax), %xmm0, %xmm1
+; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB8_1
; CHECK-NEXT: # %bb.2: # %bb10
@@ -415,12 +415,12 @@ define void @bcast_unfold_or_v16i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_or_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB12_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm0
-; CHECK-NEXT: vpord {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; CHECK-NEXT: vmovdqu64 %zmm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vpord 4096(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB12_1
; CHECK-NEXT: # %bb.2: # %bb10
@@ -449,12 +449,12 @@ define void @bcast_unfold_or_v8i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_or_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB13_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm0
-; CHECK-NEXT: vpord {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-NEXT: vmovdqu %ymm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vorps 4096(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB13_1
; CHECK-NEXT: # %bb.2: # %bb10
@@ -483,12 +483,12 @@ define void @bcast_unfold_or_v4i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_or_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,3,3,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB14_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm0
-; CHECK-NEXT: vpord {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-NEXT: vmovdqu %xmm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vorps 4096(%rdi,%rax), %xmm0, %xmm1
+; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB14_1
; CHECK-NEXT: # %bb.2: # %bb10
@@ -516,12 +516,12 @@ define void @bcast_unfold_or_v8i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_or_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB15_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm0
-; CHECK-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; CHECK-NEXT: vmovdqu64 %zmm0, 8192(%rdi,%rax)
+; CHECK-NEXT: vporq 8192(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB15_1
; CHECK-NEXT: # %bb.2: # %bb10
@@ -550,12 +550,12 @@ define void @bcast_unfold_or_v4i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_or_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3,3,3,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB16_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm0
-; CHECK-NEXT: vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; CHECK-NEXT: vmovdqu %ymm0, 8192(%rdi,%rax)
+; CHECK-NEXT: vorps 8192(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT: vmovups %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB16_1
; CHECK-NEXT: # %bb.2: # %bb10
@@ -617,12 +617,12 @@ define void @bcast_unfold_fneg_v16f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v16f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB18_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm0
-; CHECK-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; CHECK-NEXT: vmovdqu64 %zmm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vpxord 4096(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB18_1
; CHECK-NEXT: # %bb.2: # %bb9
@@ -651,12 +651,12 @@ define void @bcast_unfold_fneg_v8f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v8f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB19_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm0
-; CHECK-NEXT: vpxord {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-NEXT: vmovdqu %ymm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vxorps 4096(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB19_1
; CHECK-NEXT: # %bb.2: # %bb9
@@ -685,12 +685,12 @@ define void @bcast_unfold_fneg_v4f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v4f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB20_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm0
-; CHECK-NEXT: vpxord {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-NEXT: vmovdqu %xmm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vxorps 4096(%rdi,%rax), %xmm0, %xmm1
+; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB20_1
; CHECK-NEXT: # %bb.2: # %bb9
@@ -718,12 +718,12 @@ define void @bcast_unfold_fneg_v8f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v8f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB21_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm0
-; CHECK-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; CHECK-NEXT: vmovdqu64 %zmm0, 8192(%rdi,%rax)
+; CHECK-NEXT: vpxorq 8192(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB21_1
; CHECK-NEXT: # %bb.2: # %bb9
@@ -752,12 +752,12 @@ define void @bcast_unfold_fneg_v4f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v4f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB22_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm0
-; CHECK-NEXT: vpxorq {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; CHECK-NEXT: vmovdqu %ymm0, 8192(%rdi,%rax)
+; CHECK-NEXT: vxorps 8192(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT: vmovups %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB22_1
; CHECK-NEXT: # %bb.2: # %bb9
@@ -819,12 +819,12 @@ define void @bcast_unfold_fabs_v16f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v16f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB24_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm0
-; CHECK-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; CHECK-NEXT: vmovdqu64 %zmm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vpandd 4096(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB24_1
; CHECK-NEXT: # %bb.2: # %bb9
@@ -856,12 +856,12 @@ define void @bcast_unfold_fabs_v8f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v8f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB25_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm0
-; CHECK-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-NEXT: vmovdqu %ymm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vandps 4096(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB25_1
; CHECK-NEXT: # %bb.2: # %bb9
@@ -893,12 +893,12 @@ define void @bcast_unfold_fabs_v4f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v4f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB26_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm0
-; CHECK-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-NEXT: vmovdqu %xmm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vandps 4096(%rdi,%rax), %xmm0, %xmm1
+; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB26_1
; CHECK-NEXT: # %bb.2: # %bb9
@@ -929,12 +929,12 @@ define void @bcast_unfold_fabs_v8f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v8f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB27_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm0
-; CHECK-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; CHECK-NEXT: vmovdqu64 %zmm0, 8192(%rdi,%rax)
+; CHECK-NEXT: vpandq 8192(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB27_1
; CHECK-NEXT: # %bb.2: # %bb9
@@ -966,12 +966,12 @@ define void @bcast_unfold_fabs_v4f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v4f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB28_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm0
-; CHECK-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; CHECK-NEXT: vmovdqu %ymm0, 8192(%rdi,%rax)
+; CHECK-NEXT: vandps 8192(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT: vmovups %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB28_1
; CHECK-NEXT: # %bb.2: # %bb9
@@ -1039,12 +1039,12 @@ define void @bcast_unfold_fadd_v16f32(float* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v16f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB30_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm0
-; CHECK-NEXT: vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; CHECK-NEXT: vmovups %zmm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vaddps 4096(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB30_1
; CHECK-NEXT: # %bb.2: # %bb9
@@ -1073,12 +1073,12 @@ define void @bcast_unfold_fadd_v8f32(float* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v8f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB31_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm0
-; CHECK-NEXT: vaddps {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-NEXT: vmovups %ymm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vaddps 4096(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB31_1
; CHECK-NEXT: # %bb.2: # %bb9
@@ -1107,12 +1107,12 @@ define void @bcast_unfold_fadd_v4f32(float* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v4f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB32_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm0
-; CHECK-NEXT: vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-NEXT: vmovups %xmm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vaddps 4096(%rdi,%rax), %xmm0, %xmm1
+; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB32_1
; CHECK-NEXT: # %bb.2: # %bb9
@@ -1140,12 +1140,12 @@ define void @bcast_unfold_fadd_v8f64(double* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v8f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB33_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm0
-; CHECK-NEXT: vaddpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; CHECK-NEXT: vmovupd %zmm0, 8192(%rdi,%rax)
+; CHECK-NEXT: vaddpd 8192(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB33_1
; CHECK-NEXT: # %bb.2: # %bb9
@@ -1174,12 +1174,12 @@ define void @bcast_unfold_fadd_v4f64(double* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v4f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB34_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm0
-; CHECK-NEXT: vaddpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; CHECK-NEXT: vmovupd %ymm0, 8192(%rdi,%rax)
+; CHECK-NEXT: vaddpd 8192(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB34_1
; CHECK-NEXT: # %bb.2: # %bb9
@@ -1241,12 +1241,12 @@ define void @bcast_unfold_fmul_v16f32(float* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fmul_v16f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB36_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm0
-; CHECK-NEXT: vmulps {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; CHECK-NEXT: vmovups %zmm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vmulps 4096(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB36_1
; CHECK-NEXT: # %bb.2: # %bb9
@@ -1275,12 +1275,12 @@ define void @bcast_unfold_fmul_v8f32(float* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fmul_v8f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB37_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm0
-; CHECK-NEXT: vmulps {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-NEXT: vmovups %ymm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vmulps 4096(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB37_1
; CHECK-NEXT: # %bb.2: # %bb9
@@ -1309,12 +1309,12 @@ define void @bcast_unfold_fmul_v4f32(float* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fmul_v4f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB38_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm0
-; CHECK-NEXT: vmulps {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-NEXT: vmovups %xmm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vmulps 4096(%rdi,%rax), %xmm0, %xmm1
+; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB38_1
; CHECK-NEXT: # %bb.2: # %bb9
@@ -1342,12 +1342,12 @@ define void @bcast_unfold_fmul_v8f64(double* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fmul_v8f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB39_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm0
-; CHECK-NEXT: vmulpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; CHECK-NEXT: vmovupd %zmm0, 8192(%rdi,%rax)
+; CHECK-NEXT: vmulpd 8192(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB39_1
; CHECK-NEXT: # %bb.2: # %bb9
@@ -1376,12 +1376,12 @@ define void @bcast_unfold_fmul_v4f64(double* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fmul_v4f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB40_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm0
-; CHECK-NEXT: vmulpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; CHECK-NEXT: vmovupd %ymm0, 8192(%rdi,%rax)
+; CHECK-NEXT: vmulpd 8192(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB40_1
; CHECK-NEXT: # %bb.2: # %bb9
@@ -1443,12 +1443,13 @@ define void @bcast_unfold_fdiv_v16f32(float* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v16f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB42_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm0
-; CHECK-NEXT: vdivps {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; CHECK-NEXT: vmovups %zmm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1
+; CHECK-NEXT: vdivps %zmm0, %zmm1, %zmm1
+; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB42_1
; CHECK-NEXT: # %bb.2: # %bb9
@@ -1477,12 +1478,13 @@ define void @bcast_unfold_fdiv_v8f32(float* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v8f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB43_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm0
-; CHECK-NEXT: vdivps {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-NEXT: vmovups %ymm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1
+; CHECK-NEXT: vdivps %ymm0, %ymm1, %ymm1
+; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB43_1
; CHECK-NEXT: # %bb.2: # %bb9
@@ -1511,12 +1513,13 @@ define void @bcast_unfold_fdiv_v4f32(float* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v4f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB44_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm0
-; CHECK-NEXT: vdivps {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-NEXT: vmovups %xmm0, 4096(%rdi,%rax)
+; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1
+; CHECK-NEXT: vdivps %xmm0, %xmm1, %xmm1
+; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB44_1
; CHECK-NEXT: # %bb.2: # %bb9
@@ -1544,12 +1547,13 @@ define void @bcast_unfold_fdiv_v8f64(double* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v8f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB45_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm0
-; CHECK-NEXT: vdivpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; CHECK-NEXT: vmovupd %zmm0, 8192(%rdi,%rax)
+; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1
+; CHECK-NEXT: vdivpd %zmm0, %zmm1, %zmm1
+; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB45_1
; CHECK-NEXT: # %bb.2: # %bb9
@@ -1578,12 +1582,13 @@ define void @bcast_unfold_fdiv_v4f64(double* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v4f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB46_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm0
-; CHECK-NEXT: vdivpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; CHECK-NEXT: vmovupd %ymm0, 8192(%rdi,%rax)
+; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1
+; CHECK-NEXT: vdivpd %ymm0, %ymm1, %ymm1
+; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB46_1
; CHECK-NEXT: # %bb.2: # %bb9
OpenPOWER on IntegriCloud