Diffstat (limited to 'llvm')
25 files changed, 1028 insertions, 798 deletions
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGISel.h b/llvm/include/llvm/CodeGen/SelectionDAGISel.h
index ed6fc76c8bf..de6849a1eae 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGISel.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGISel.h
@@ -110,8 +110,6 @@ public:
                      CodeGenOpt::Level OptLevel,
                      bool IgnoreChains = false);
 
-  static void EnforceNodeIdInvariant(SDNode *N);
-
   // Opcodes used by the DAG state machine:
   enum BuiltinOpcodes {
     OPC_Scope,
@@ -201,28 +199,23 @@ protected:
   /// of the new node T.
   void ReplaceUses(SDValue F, SDValue T) {
     CurDAG->ReplaceAllUsesOfValueWith(F, T);
-    EnforceNodeIdInvariant(T.getNode());
   }
 
   /// ReplaceUses - replace all uses of the old nodes F with the use
   /// of the new nodes T.
   void ReplaceUses(const SDValue *F, const SDValue *T, unsigned Num) {
     CurDAG->ReplaceAllUsesOfValuesWith(F, T, Num);
-    for (unsigned i = 0; i < Num; ++i)
-      EnforceNodeIdInvariant(T[i].getNode());
   }
 
   /// ReplaceUses - replace all uses of the old node F with the use
   /// of the new node T.
   void ReplaceUses(SDNode *F, SDNode *T) {
     CurDAG->ReplaceAllUsesWith(F, T);
-    EnforceNodeIdInvariant(T);
   }
 
   /// Replace all uses of \c F with \c T, then remove \c F from the DAG.
   void ReplaceNode(SDNode *F, SDNode *T) {
     CurDAG->ReplaceAllUsesWith(F, T);
-    EnforceNodeIdInvariant(T);
     CurDAG->RemoveDeadNode(F);
   }
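With EnforceNodeIdInvariant removed, the replacement helpers above are again thin wrappers over the DAG-level calls. A minimal sketch of what a backend call site amounts to after this change (the target name MyTarget and its opcode are invented for illustration):

// Sketch only: ReplaceNode(N, NewN) now expands to exactly the two DAG calls
// below, with no node-id bookkeeping in between.
void MyTargetDAGToDAGISel::Select(SDNode *N) {
  SDLoc DL(N);
  // Hypothetical machine opcode producing the same value type as N.
  MachineSDNode *NewN = CurDAG->getMachineNode(MyTarget::INSTRrr, DL,
                                               N->getValueType(0),
                                               N->getOperand(0));
  CurDAG->ReplaceAllUsesWith(N, NewN); // first half of ReplaceNode
  CurDAG->RemoveDeadNode(N);           // second half of ReplaceNode
}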
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index c919a593014..a2bead3c6ab 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -800,44 +800,16 @@ public:
   /// searches to be performed in parallel, caching of results across
   /// queries and incremental addition to Worklist. Stops early if N is
   /// found but will resume. Remember to clear Visited and Worklists
-  /// if DAG changes. MaxSteps gives a maximum number of nodes to visit before
-  /// giving up. The TopologicalPrune flag signals that positive NodeIds are
-  /// topologically ordered (operands have strictly smaller node ids) and the
-  /// search can be pruned by leveraging this.
+  /// if DAG changes.
   static bool hasPredecessorHelper(const SDNode *N,
                                    SmallPtrSetImpl<const SDNode *> &Visited,
                                    SmallVectorImpl<const SDNode *> &Worklist,
-                                   unsigned int MaxSteps = 0,
-                                   bool TopologicalPrune = false) {
-    SmallVector<const SDNode *, 8> DeferredNodes;
+                                   unsigned int MaxSteps = 0) {
     if (Visited.count(N))
       return true;
-
-    // Node ids are assigned in three places: as a topological
-    // ordering (> 0), during legalization (results in values set to
-    // 0), and for new nodes (set to -1). If N has a topological id then we
-    // know that all nodes with ids smaller than it cannot be
-    // successors and we need not check them. Filter out all nodes
-    // that can't be matched. We add them to the worklist before exit
-    // in case of multiple calls. Note that during selection the topological
-    // order may be violated if a node's predecessor is selected before it. We
-    // mark this at selection by negating the ids of unselected successors and
-    // restricting topological pruning to positive ids.
-
-    int NId = N->getNodeId();
-    // If we invalidated the id, reconstruct the original NId.
-    if (NId < -1)
-      NId = -(NId + 1);
-
-    bool Found = false;
     while (!Worklist.empty()) {
       const SDNode *M = Worklist.pop_back_val();
-      int MId = M->getNodeId();
-      if (TopologicalPrune && M->getOpcode() != ISD::TokenFactor && (NId > 0) &&
-          (MId > 0) && (MId < NId)) {
-        DeferredNodes.push_back(M);
-        continue;
-      }
+      bool Found = false;
       for (const SDValue &OpV : M->op_values()) {
         SDNode *Op = OpV.getNode();
         if (Visited.insert(Op).second)
@@ -846,16 +818,11 @@ public:
           Found = true;
       }
       if (Found)
-        break;
+        return true;
       if (MaxSteps != 0 && Visited.size() >= MaxSteps)
-        break;
+        return true;
     }
-    // Push deferred nodes back on worklist.
-    Worklist.append(DeferredNodes.begin(), DeferredNodes.end());
-    // If we bailed early, conservatively return found.
-    if (MaxSteps != 0 && Visited.size() >= MaxSteps)
-      return true;
-    return Found;
+    return false;
   }
 
   /// Return true if all the users of N are contained in Nodes.
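The helper reverts to a plain worklist DFS with an optional step bound and no TopologicalPrune flag. A hedged usage sketch (Def and Ops stand in for nodes from a surrounding match; this wrapper is not part of the patch):

// Sketch: returns true if Def is reachable from any node in Ops, or
// conservatively true once MaxSteps nodes have been visited.
static bool mayBePredecessor(const SDNode *Def, ArrayRef<SDNode *> Ops) {
  SmallPtrSet<const SDNode *, 16> Visited;
  SmallVector<const SDNode *, 16> Worklist(Ops.begin(), Ops.end());
  return SDNode::hasPredecessorHelper(Def, Visited, Worklist,
                                      /*MaxSteps=*/1024);
}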
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 414b3be77e1..067690af636 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -960,59 +960,6 @@ public:
 
 } // end anonymous namespace
 
-// This function is used to enforce the topological node id property
-// leveraged during instruction selection. Before selection, all
-// nodes are given a non-negative id such that all nodes have a larger id than
-// their operands. As this holds transitively we can prune the check that a
-// node N is a predecessor of another node M by not recursively checking
-// through M's operands when N's id is larger than M's id. This significantly
-// improves the performance of various legality checks (e.g. IsLegalToFold /
-// UpdateChains).
-
-// However, when we fuse multiple nodes into a single node
-// during selection we may induce a predecessor relationship between inputs
-// and outputs of distinct nodes being merged, violating the topological
-// property. Should a fused node have a successor which has yet to be
-// selected, our legality checks would be incorrect. To avoid this we mark all
-// unselected successor nodes, i.e. id != -1, as invalid for pruning by
-// bit-negating their ids (x => -(x+1)) and modify our pruning check to ignore
-// negative ids of M. (We use bit-negation to more clearly enforce that node
-// id -1 can only be achieved by selected nodes.) As the conversion is
-// reversible and the original id recoverable, topological pruning can still
-// be leveraged when looking for unselected nodes. This method is called
-// internally by all ISel replacement functions.
-void SelectionDAGISel::EnforceNodeIdInvariant(SDNode *Node) {
-  SmallVector<SDNode *, 4> OpNodes;
-  SmallVector<SDNode *, 4> Nodes;
-  SmallPtrSet<const SDNode *, 32> Visited;
-  OpNodes.push_back(Node);
-
-  while (!OpNodes.empty()) {
-    SDNode *N = OpNodes.pop_back_val();
-    for (const SDValue &Op : N->op_values()) {
-      if (Op->getNodeId() == -1 && Visited.insert(Op.getNode()).second)
-        OpNodes.push_back(Op.getNode());
-    }
-    Nodes.push_back(N);
-  }
-
-  Visited.clear();
-  while (!Nodes.empty()) {
-    SDNode *N = Nodes.pop_back_val();
-
-    // Don't repeat work.
-    if (!Visited.insert(N).second)
-      continue;
-    for (auto *U : N->uses()) {
-      auto UId = U->getNodeId();
-      if (UId > 0) {
-        int InvalidatedUId = -(UId + 1);
-        U->setNodeId(InvalidatedUId);
-        Nodes.push_back(U);
-      }
-    }
-  }
-}
-
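The removed comment documents the invalidation encoding x => -(x + 1). A standalone sketch of the round trip it describes (illustrative only, not part of the diff):

// Positive ids are topological, -1 means "selected", and values below -1 are
// invalidated-but-recoverable ids.
int invalidateId(int Id) { return -(Id + 1); }                // 5 -> -6
int originalId(int Id)   { return Id < -1 ? -(Id + 1) : Id; } // -6 -> 5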
 void SelectionDAGISel::DoInstructionSelection() {
   DEBUG(dbgs() << "===== Instruction selection begins: "
                << printMBBReference(*FuncInfo->MBB) << " '"
@@ -1048,33 +995,6 @@ void SelectionDAGISel::DoInstructionSelection() {
       if (Node->use_empty())
         continue;
 
-#ifndef NDEBUG
-      SmallVector<SDNode *, 4> Nodes;
-      Nodes.push_back(Node);
-
-      while (!Nodes.empty()) {
-        auto N = Nodes.pop_back_val();
-        if (Node->getOpcode() == ISD::TokenFactor || Node->getNodeId() < 0)
-          continue;
-        for (const SDValue &Op : N->op_values()) {
-          if (Op->getOpcode() == ISD::TokenFactor)
-            Nodes.push_back(Op.getNode());
-          else {
-            // We rely on topological ordering of node ids for checking for
-            // cycles when fusing nodes during selection. All unselected
-            // successors of an already selected node should have a negative
-            // id. This assertion will catch such cases. If this assertion
-            // triggers it is likely you are using DAG-level Value/Node
-            // replacement functions (versus equivalent ISEL replacement) in
-            // backend-specific selections. See comment in
-            // EnforceNodeIdInvariant for more details.
-            assert(Op->getNodeId() != -1 &&
-                   "Node has already selected predecessor node");
-          }
-        }
-      }
-#endif
-
       // When we are using non-default rounding modes or FP exception behavior
       // FP operations are represented by StrictFP pseudo-operations.  They
       // need to be simplified here so that the target-specific instruction
@@ -2242,44 +2162,54 @@ static SDNode *findGlueUse(SDNode *N) {
   return nullptr;
 }
 
-/// findNonImmUse - Return true if "Def" is a predecessor of "Root" via a path
-/// beyond "ImmedUse". We may ignore chains as they are checked separately.
-static bool findNonImmUse(SDNode *Root, SDNode *Def, SDNode *ImmedUse,
+/// findNonImmUse - Return true if "Use" is a non-immediate use of "Def".
+/// This function iteratively traverses up the operand chain, ignoring
+/// certain nodes.
+static bool findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse,
+                          SDNode *Root, SmallPtrSetImpl<SDNode*> &Visited,
                           bool IgnoreChains) {
-  SmallPtrSet<const SDNode *, 16> Visited;
-  SmallVector<const SDNode *, 16> WorkList;
-  // Only check if we have non-immediate uses of Def.
-  if (ImmedUse->isOnlyUserOf(Def))
-    return false;
-
-  // We don't care about paths to Def that go through ImmedUse so mark it
-  // visited and mark non-def operands as used.
-  Visited.insert(ImmedUse);
-  for (const SDValue &Op : ImmedUse->op_values()) {
-    SDNode *N = Op.getNode();
-    // Ignore chain deps (they are validated by
-    // HandleMergeInputChains) and immediate uses
-    if ((Op.getValueType() == MVT::Other && IgnoreChains) || N == Def)
+  // Node ids are assigned as unique ids, where a node's id is guaranteed to
+  // be greater than the ids of all of its (recursive) operands. If we scan to
+  // a point where 'Use' is smaller than the node we're scanning for, then we
+  // know we will never find it.
+  //
+  // The Use may have an id of -1 (unassigned) if it is a newly allocated
+  // node. This can happen because we scan down to newly selected nodes in the
+  // case of glue uses.
+  std::vector<SDNode *> WorkList;
+  WorkList.push_back(Use);
+
+  while (!WorkList.empty()) {
+    Use = WorkList.back();
+    WorkList.pop_back();
+    // The topological order of TokenFactor node ids is not guaranteed; do
+    // not skip them.
+    if (Use->getOpcode() != ISD::TokenFactor &&
+        Use->getNodeId() < Def->getNodeId() && Use->getNodeId() != -1)
       continue;
-    if (!Visited.insert(N).second)
+
+    // Don't revisit a node if we already scanned it and didn't fail; we know
+    // we won't fail if we scan it again.
+    if (!Visited.insert(Use).second)
       continue;
-    WorkList.push_back(N);
-  }
-
-  // Initialize worklist to operands of Root.
-  if (Root != ImmedUse) {
-    for (const SDValue &Op : Root->op_values()) {
-      SDNode *N = Op.getNode();
-      // Ignore chains (they are validated by HandleMergeInputChains)
-      if ((Op.getValueType() == MVT::Other && IgnoreChains) || N == Def)
-        continue;
-      if (!Visited.insert(N).second)
+    for (const SDValue &Op : Use->op_values()) {
+      // Ignore chain uses, they are validated by HandleMergeInputChains.
+      if (Op.getValueType() == MVT::Other && IgnoreChains)
         continue;
+
+      SDNode *N = Op.getNode();
+      if (N == Def) {
+        if (Use == ImmedUse || Use == Root)
+          continue;  // We are not looking for immediate use.
+        assert(N != Root);
+        return true;
+      }
+
+      // Traverse up the operand chain.
       WorkList.push_back(N);
     }
   }
-
-  return SDNode::hasPredecessorHelper(Def, Visited, WorkList, 0, true);
+  return false;
 }
 
 /// IsProfitableToFold - Returns true if it's profitable to fold the specific
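The restored walk prunes by node id. Its skip rule, pulled out as a standalone predicate for clarity (names mirror the function's parameters; this helper does not exist in the tree):

// A Use can be skipped when ids are topological: an id already smaller than
// Def's means Def cannot be reached through Use's operands. TokenFactors and
// unassigned ids (-1) must still be scanned.
static bool canSkip(const SDNode *Use, const SDNode *Def) {
  return Use->getOpcode() != ISD::TokenFactor &&
         Use->getNodeId() != -1 &&
         Use->getNodeId() < Def->getNodeId();
}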
@@ -2351,12 +2281,13 @@ bool SelectionDAGISel::IsLegalToFold(SDValue N, SDNode *U, SDNode *Root,
 
     // If our query node has a glue result with a use, we've walked up it.  If
     // the user (which has already been selected) has a chain or indirectly uses
-    // the chain, HandleMergeInputChains will not consider it.  Because of
+    // the chain, our WalkChainUsers predicate will not consider it.  Because of
     // this, we cannot ignore chains in this predicate.
     IgnoreChains = false;
   }
 
-  return !findNonImmUse(Root, N.getNode(), U, IgnoreChains);
+  SmallPtrSet<SDNode*, 16> Visited;
+  return !findNonImmUse(Root, N.getNode(), U, Root, Visited, IgnoreChains);
 }
 
 void SelectionDAGISel::Select_INLINEASM(SDNode *N) {
@@ -2460,7 +2391,7 @@ void SelectionDAGISel::UpdateChains(
                       static_cast<SDNode *>(nullptr));
           });
       if (ChainNode->getOpcode() != ISD::TokenFactor)
-        ReplaceUses(ChainVal, InputChain);
+        CurDAG->ReplaceAllUsesOfValueWith(ChainVal, InputChain);
 
       // If the node became dead and we haven't already seen it, delete it.
       if (ChainNode != NodeToMatch && ChainNode->use_empty() &&
@@ -2475,6 +2406,143 @@ void SelectionDAGISel::UpdateChains(
   DEBUG(dbgs() << "ISEL: Match complete!\n");
 }
 
+enum ChainResult {
+  CR_Simple,
+  CR_InducesCycle,
+  CR_LeadsToInteriorNode
+};
+
+/// WalkChainUsers - Walk down the users of the specified chained node that is
+/// part of the pattern we're matching, looking at all of the users we find.
+/// This determines whether something is an interior node, whether we have a
+/// non-pattern node in between two pattern nodes (which prevents folding
+/// because it would induce a cycle) and whether we have a TokenFactor node
+/// sandwiched between pattern nodes (in which case the TF becomes part of the
+/// pattern).
+///
+/// The walk we do here is guaranteed to be small because we quickly get down
+/// to already selected nodes "below" us.
+static ChainResult
+WalkChainUsers(const SDNode *ChainedNode,
+               SmallVectorImpl<SDNode *> &ChainedNodesInPattern,
+               DenseMap<const SDNode *, ChainResult> &TokenFactorResult,
+               SmallVectorImpl<SDNode *> &InteriorChainedNodes) {
+  ChainResult Result = CR_Simple;
+
+  for (SDNode::use_iterator UI = ChainedNode->use_begin(),
+         E = ChainedNode->use_end(); UI != E; ++UI) {
+    // Make sure the use is of the chain, not some other value we produce.
+    if (UI.getUse().getValueType() != MVT::Other) continue;
+
+    SDNode *User = *UI;
+
+    if (User->getOpcode() == ISD::HANDLENODE)  // Root of the graph.
+      continue;
+
+    // If we see an already-selected machine node, then we've gone beyond the
+    // pattern that we're selecting down into the already selected chunk of
+    // the DAG.
+    unsigned UserOpcode = User->getOpcode();
+    if (User->isMachineOpcode() ||
+        UserOpcode == ISD::CopyToReg ||
+        UserOpcode == ISD::CopyFromReg ||
+        UserOpcode == ISD::INLINEASM ||
+        UserOpcode == ISD::EH_LABEL ||
+        UserOpcode == ISD::LIFETIME_START ||
+        UserOpcode == ISD::LIFETIME_END) {
+      // If their node ID got reset to -1 then they've already been selected.
+      // Treat them like a MachineOpcode.
+      if (User->getNodeId() == -1)
+        continue;
+    }
+
+    // If we have a TokenFactor, we handle it specially.
+    if (User->getOpcode() != ISD::TokenFactor) {
+      // If the node isn't a token factor and isn't part of our pattern, then
+      // it must be a random chained node in between two nodes we're
+      // selecting. This happens when we have something like:
+      //   x = load ptr
+      //   call
+      //   y = x+4
+      //   store y -> ptr
+      // Because we structurally match the load/store as a read/modify/write,
+      // but the call is chained between them.  We cannot fold in this case
+      // because it would induce a cycle in the graph.
+      if (!std::count(ChainedNodesInPattern.begin(),
+                      ChainedNodesInPattern.end(), User))
+        return CR_InducesCycle;
+
+      // Otherwise we found a node that is part of our pattern.  For example
+      // in:
+      //   x = load ptr
+      //   y = x+4
+      //   store y -> ptr
+      // This would happen when we're scanning down from the load and see the
+      // store as a user.  Record that there is a use of ChainedNode that is
+      // part of the pattern and keep scanning uses.
+      Result = CR_LeadsToInteriorNode;
+      InteriorChainedNodes.push_back(User);
+      continue;
+    }
+
+    // If we found a TokenFactor, there are two cases to consider: first if
+    // the TokenFactor is just hanging "below" the pattern we're matching
+    // (i.e. no uses of the TF are in our pattern) we just want to ignore it.
+    // Second, the TokenFactor can be sandwiched in between two chained nodes,
+    // like so:
+    //       [Load chain]
+    //           ^
+    //           |
+    //         [Load]
+    //         ^    ^
+    //         |    \                    DAG's like cheese
+    //        /       \                       do you?
+    //       /          |
+    // [TokenFactor]   [Op]
+    //       ^          ^
+    //       |          |
+    //        \        /
+    //         \      /
+    //         [Store]
+    //
+    // In this case, the TokenFactor becomes part of our match and we rewrite
+    // it as a new TokenFactor.
+    //
+    // To distinguish these two cases, do a recursive walk down the uses.
+    auto MemoizeResult = TokenFactorResult.find(User);
+    bool Visited = MemoizeResult != TokenFactorResult.end();
+    // Recursively walk chain users only if the result is not memoized.
+    if (!Visited) {
+      auto Res = WalkChainUsers(User, ChainedNodesInPattern, TokenFactorResult,
+                                InteriorChainedNodes);
+      MemoizeResult = TokenFactorResult.insert(std::make_pair(User, Res)).first;
+    }
+    switch (MemoizeResult->second) {
+    case CR_Simple:
+      // If the uses of the TokenFactor are just already-selected nodes,
+      // ignore it; it is "below" our pattern.
+      continue;
+    case CR_InducesCycle:
+      // If the uses of the TokenFactor lead to nodes that are not part of our
+      // pattern and are not selected, folding would turn this into a cycle;
+      // bail out now.
+      return CR_InducesCycle;
+    case CR_LeadsToInteriorNode:
+      break;  // Otherwise, keep processing.
+    }
+
+    // Okay, we know we're in the interesting interior case.  The TokenFactor
+    // is now going to be considered part of the pattern so that we rewrite
+    // its uses (it may have uses that are not part of the pattern) with the
+    // ultimate chain result of the generated code.  We will also add its
+    // chain inputs as inputs to the ultimate TokenFactor we create.
+    Result = CR_LeadsToInteriorNode;
+    if (!Visited) {
+      ChainedNodesInPattern.push_back(User);
+      InteriorChainedNodes.push_back(User);
+    }
+  }
+
+  return Result;
+}
+
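WalkChainUsers only recurses into a TokenFactor when its result is not already memoized, which keeps the walk linear. The compute-once pattern in isolation (a sketch; the real code threads more state through the call):

static ChainResult walkMemoized(const SDNode *TF,
                                DenseMap<const SDNode *, ChainResult> &Memo) {
  auto It = Memo.find(TF);
  if (It != Memo.end())
    return It->second;          // Reuse a previously computed result.
  ChainResult R = CR_Simple;    // ...compute R by scanning TF's users...
  return Memo.insert(std::make_pair(TF, R)).first->second;
}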
 /// HandleMergeInputChains - This implements the OPC_EmitMergeInputChains
 /// operation for when the pattern matched at least one node with a chain.
 /// The input vector contains a list of all of the chained nodes that we
 /// match. We
@@ -2484,56 +2552,47 @@ void SelectionDAGISel::UpdateChains(
 static SDValue
 HandleMergeInputChains(SmallVectorImpl<SDNode*> &ChainNodesMatched,
                        SelectionDAG *CurDAG) {
+  // Used for memoization. Without it WalkChainUsers could take exponential
+  // time to run.
+  DenseMap<const SDNode *, ChainResult> TokenFactorResult;
+  // Walk all of the chained nodes we've matched, recursively scanning down
+  // the users of the chain result. This adds any TokenFactor nodes that are
+  // caught in between chained nodes to the chained and interior nodes list.
+  SmallVector<SDNode*, 3> InteriorChainedNodes;
+  for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) {
+    if (WalkChainUsers(ChainNodesMatched[i], ChainNodesMatched,
+                       TokenFactorResult,
+                       InteriorChainedNodes) == CR_InducesCycle)
+      return SDValue(); // Would induce a cycle.
+  }
 
-  SmallPtrSet<const SDNode *, 16> Visited;
-  SmallVector<const SDNode *, 8> Worklist;
+  // Okay, we have walked all the matched nodes and collected TokenFactor
+  // nodes that we are interested in.  Form our input TokenFactor node.
   SmallVector<SDValue, 3> InputChains;
-  unsigned int Max = 8192;
-
-  // Quick exit on trivial merge.
-  if (ChainNodesMatched.size() == 1)
-    return ChainNodesMatched[0]->getOperand(0);
+  for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) {
+    // Add the input chain of this node to the InputChains list (which will
+    // be the operands of the generated TokenFactor) if it's not an interior
+    // node.
+    SDNode *N = ChainNodesMatched[i];
+    if (N->getOpcode() != ISD::TokenFactor) {
+      if (std::count(InteriorChainedNodes.begin(), InteriorChainedNodes.end(),
                     N))
+        continue;
 
-  // Add chains that aren't already added (internal). Peek through
-  // token factors.
-  std::function<void(const SDValue)> AddChains = [&](const SDValue V) {
-    if (V.getValueType() != MVT::Other)
-      return;
-    if (V->getOpcode() == ISD::EntryToken)
-      return;
-    if (!Visited.insert(V.getNode()).second)
-      return;
-    if (V->getOpcode() == ISD::TokenFactor) {
-      for (const SDValue &Op : V->op_values())
-        AddChains(Op);
-    } else
-      InputChains.push_back(V);
-  };
+      // Otherwise, add the input chain.
+      SDValue InChain = ChainNodesMatched[i]->getOperand(0);
+      assert(InChain.getValueType() == MVT::Other && "Not a chain");
+      InputChains.push_back(InChain);
+      continue;
+    }
 
-  for (auto *N : ChainNodesMatched) {
-    Worklist.push_back(N);
-    Visited.insert(N);
+    // If we have a token factor, we want to add all inputs of the token
+    // factor that are not part of the pattern we're matching.
+    for (const SDValue &Op : N->op_values()) {
+      if (!std::count(ChainNodesMatched.begin(), ChainNodesMatched.end(),
+                      Op.getNode()))
+        InputChains.push_back(Op);
+    }
   }
 
-  while (!Worklist.empty())
-    AddChains(Worklist.pop_back_val()->getOperand(0));
-
-  // Skip the search if there are no chain dependencies.
-  if (InputChains.size() == 0)
-    return CurDAG->getEntryNode();
-
-  // If one of these chains is a successor of the input, we must have a
-  // node that is both a predecessor and a successor of the
-  // to-be-merged nodes. Fail.
-  Visited.clear();
-  for (SDValue V : InputChains)
-    Worklist.push_back(V.getNode());
-
-  for (auto *N : ChainNodesMatched)
-    if (SDNode::hasPredecessorHelper(N, Visited, Worklist, Max, true))
-      return SDValue();
-
-  // Return merged chain.
   if (InputChains.size() == 1)
     return InputChains[0];
   return CurDAG->getNode(ISD::TokenFactor, SDLoc(ChainNodesMatched[0]),
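The tail of the restored function reduces to a simple merge of the collected input chains. As a standalone sketch (hypothetical helper, same logic as the context lines above):

static SDValue mergeChains(SelectionDAG *CurDAG, const SDLoc &DL,
                           ArrayRef<SDValue> InputChains) {
  if (InputChains.size() == 1)
    return InputChains[0]; // A single chain needs no TokenFactor.
  return CurDAG->getNode(ISD::TokenFactor, DL, MVT::Other, InputChains);
}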
@@ -2578,8 +2637,8 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList,
   // Move the glue if needed.
   if ((EmitNodeInfo & OPFL_GlueOutput) && OldGlueResultNo != -1 &&
       (unsigned)OldGlueResultNo != ResNumResults-1)
-    ReplaceUses(SDValue(Node, OldGlueResultNo),
-                SDValue(Res, ResNumResults - 1));
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(Node, OldGlueResultNo),
+                                      SDValue(Res, ResNumResults-1));
 
   if ((EmitNodeInfo & OPFL_GlueOutput) != 0)
     --ResNumResults;
@@ -2587,15 +2646,14 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList,
   // Move the chain reference if needed.
   if ((EmitNodeInfo & OPFL_Chain) && OldChainResultNo != -1 &&
       (unsigned)OldChainResultNo != ResNumResults-1)
-    ReplaceUses(SDValue(Node, OldChainResultNo),
-                SDValue(Res, ResNumResults - 1));
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(Node, OldChainResultNo),
+                                      SDValue(Res, ResNumResults-1));
 
   // Otherwise, no replacement happened because the node already exists.
   // Replace uses of the old node with the new one.
   if (Res != Node) {
-    ReplaceNode(Node, Res);
-  } else {
-    EnforceNodeIdInvariant(Res);
+    CurDAG->ReplaceAllUsesWith(Node, Res);
+    CurDAG->RemoveDeadNode(Node);
   }
 
   return Res;
@@ -2912,7 +2970,8 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
     return;
   case ISD::AssertSext:
   case ISD::AssertZext:
-    ReplaceUses(SDValue(NodeToMatch, 0), NodeToMatch->getOperand(0));
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(NodeToMatch, 0),
+                                      NodeToMatch->getOperand(0));
     CurDAG->RemoveDeadNode(NodeToMatch);
     return;
   case ISD::INLINEASM:
@@ -3670,7 +3729,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
               NodeToMatch->getValueType(i).getSizeInBits() ==
                   Res.getValueSizeInBits()) &&
              "invalid replacement");
-      ReplaceUses(SDValue(NodeToMatch, i), Res);
+      CurDAG->ReplaceAllUsesOfValueWith(SDValue(NodeToMatch, i), Res);
     }
 
     // Update chain uses.
@@ -3683,8 +3742,8 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
     if (NodeToMatch->getValueType(NodeToMatch->getNumValues() - 1) ==
             MVT::Glue &&
         InputGlue.getNode())
-      ReplaceUses(SDValue(NodeToMatch, NodeToMatch->getNumValues() - 1),
-                  InputGlue);
+      CurDAG->ReplaceAllUsesOfValueWith(
+          SDValue(NodeToMatch, NodeToMatch->getNumValues() - 1), InputGlue);
 
     assert(NodeToMatch->use_empty() &&
            "Didn't replace all uses of the node?");
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 6a47a5ba50c..90cd4ffaf0b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -766,11 +766,12 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
 
   if (ProduceCarry) {
     // Replace the carry-use
-    ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(AddHi, 1));
   }
 
   // Replace the remaining uses.
-  ReplaceNode(N, RegSequence);
+  CurDAG->ReplaceAllUsesWith(N, RegSequence);
+  CurDAG->RemoveDeadNode(N);
 }
 
 void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 0063303ac48..94fe84c8751 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -500,7 +500,7 @@ bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N,
 
 void ARMDAGToDAGISel::replaceDAGValue(const SDValue &N, SDValue M) {
   CurDAG->RepositionNode(N.getNode()->getIterator(), M.getNode());
-  ReplaceUses(N, M);
+  CurDAG->ReplaceAllUsesWith(N, M);
 }
 
 bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N,
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index fa992490f5d..3540cf06b9c 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -662,7 +662,7 @@ void HexagonDAGToDAGISel::SelectBitcast(SDNode *N) {
     return;
   }
 
-  ReplaceUses(SDValue(N, 0), N->getOperand(0));
+  CurDAG->ReplaceAllUsesOfValueWith(SDValue(N,0), N->getOperand(0));
   CurDAG->RemoveDeadNode(N);
 }
 
@@ -726,6 +726,7 @@ void HexagonDAGToDAGISel::SelectTypecast(SDNode *N) {
   SDNode *T = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(OpTy),
                                   {Op});
   ReplaceNode(T, Op.getNode());
+  CurDAG->RemoveDeadNode(T);
 }
 
 void HexagonDAGToDAGISel::SelectP2D(SDNode *N) {
@@ -2184,3 +2185,4 @@ void HexagonDAGToDAGISel::rebalanceAddressTrees() {
   RootHeights.clear();
   RootWeights.clear();
 }
+
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
index 285197a909e..46f5bb4de8a 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
@@ -1953,6 +1953,7 @@ void HvxSelector::selectShuffle(SDNode *N) {
   // If the mask is all -1's, generate "undef".
   if (!UseLeft && !UseRight) {
     ISel.ReplaceNode(N, ISel.selectUndef(SDLoc(SN), ResTy).getNode());
+    DAG.RemoveDeadNode(N);
     return;
   }
 
@@ -2008,6 +2009,7 @@ void HvxSelector::selectRor(SDNode *N) {
     NewN = DAG.getMachineNode(Hexagon::V6_vror, dl, Ty, {VecV, RotV});
 
   ISel.ReplaceNode(N, NewN);
+  DAG.RemoveDeadNode(N);
 }
 
 void HvxSelector::selectVAlign(SDNode *N) {
@@ -2068,7 +2070,8 @@ void HexagonDAGToDAGISel::SelectV65GatherPred(SDNode *N) {
   MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
   cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
 
-  ReplaceNode(N, Result);
+  ReplaceUses(N, Result);
+  CurDAG->RemoveDeadNode(N);
 }
 
 void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) {
@@ -2106,7 +2109,8 @@ void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) {
   MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
   cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
 
-  ReplaceNode(N, Result);
+  ReplaceUses(N, Result);
+  CurDAG->RemoveDeadNode(N);
 }
 
 void HexagonDAGToDAGISel::SelectHVXDualOutput(SDNode *N) {
@@ -2149,3 +2153,5 @@ void HexagonDAGToDAGISel::SelectHVXDualOutput(SDNode *N) {
   ReplaceUses(SDValue(N, 1), SDValue(Result, 1));
   CurDAG->RemoveDeadNode(N);
 }
+
+
diff --git a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index 6e2130828bb..9bf2474915c 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -596,13 +596,7 @@ static void insertDAGNode(SelectionDAG *DAG, SDNode *Pos, SDValue N) {
   if (N.getNode()->getNodeId() == -1 ||
       N.getNode()->getNodeId() > Pos->getNodeId()) {
     DAG->RepositionNode(Pos->getIterator(), N.getNode());
-    // Mark the node as invalid for pruning, as after this it may be a
-    // successor to a selected node but otherwise be in the same position as
-    // Pos. Conservatively mark it with the same -abs(Id) to ensure the node
-    // id invariant is preserved.
-    int PId = Pos->getNodeId();
-    int InvalidatedPId = -(PId + 1);
-    N->setNodeId((PId > 0) ? InvalidatedPId : PId);
+    N.getNode()->setNodeId(Pos->getNodeId());
   }
 }
 
@@ -1033,7 +1027,8 @@ bool SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
   };
   SDValue New = convertTo(
       DL, VT, SDValue(CurDAG->getMachineNode(Opcode, DL, OpcodeVT, Ops), 0));
-  ReplaceNode(N, New.getNode());
+  ReplaceUses(N, New.getNode());
+  CurDAG->RemoveDeadNode(N);
   return true;
 }
 
@@ -1124,7 +1119,8 @@ void SystemZDAGToDAGISel::splitLargeImmediate(unsigned Opcode, SDNode *Node,
   SDValue Lower = CurDAG->getConstant(LowerVal, DL, VT);
   SDValue Or = CurDAG->getNode(Opcode, DL, VT, Upper, Lower);
 
-  ReplaceNode(Node, Or.getNode());
+  ReplaceUses(Node, Or.getNode());
+  CurDAG->RemoveDeadNode(Node);
   SelectCode(Or.getNode());
 }
 
@@ -1622,3 +1618,4 @@ void SystemZDAGToDAGISel::PreprocessISelDAG() {
   if (MadeChange)
     CurDAG->RemoveDeadNodes();
 }
+
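The SystemZ hunk restores the simpler id handling when a node is repositioned: the moved node just inherits Pos's id rather than being marked invalid. The restored behavior as a standalone sketch (hypothetical wrapper around the same two calls):

static void repositionBefore(SelectionDAG *DAG, SDNode *Pos, SDValue N) {
  if (N.getNode()->getNodeId() == -1 ||
      N.getNode()->getNodeId() > Pos->getNodeId()) {
    DAG->RepositionNode(Pos->getIterator(), N.getNode());
    N.getNode()->setNodeId(Pos->getNodeId());
  }
}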
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index ebcd15230b9..bcac241bb3a 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2109,86 +2109,52 @@ static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
       LoadNode->getOffset() != StoreNode->getOffset())
     return false;
 
-  bool FoundLoad = false;
-  SmallVector<SDValue, 4> ChainOps;
-  SmallVector<const SDNode *, 4> LoopWorklist;
-  SmallPtrSet<const SDNode *, 16> Visited;
-  const unsigned int Max = 1024;
-
-  // Visualization of Load-Op-Store fusion:
-  // -------------------------
-  // Legend:
-  //    *-lines = Chain operand dependencies.
-  //    |-lines = Normal operand dependencies.
-  //    Dependencies flow down and right. n-suffix references multiple nodes.
-  //
-  //        C                        Xn  C
-  //        *                         *  *
-  //        *                         *  *
-  //  Xn  A-LD    Yn                 TF          Yn
-  //   *    * \   |                   *          |
-  //    *   *  \  |                   *          |
-  //     *  *   \ |             =>    A--LD_OP_ST
-  //      * *    \|                        \
-  //       TF    OP                         \
-  //        *    |                           \ Zn
-  //         *   |                            \
-  //        A-ST Zn
-  //
-
-  // This merge induces dependences from: #1: Xn -> LD, OP, Zn
-  //                                      #2: Yn -> LD
-  //                                      #3: ST -> Zn
-
-  // Ensure the transform is safe by checking for the dual
-  // dependencies to make sure we do not induce a loop.
-
-  // As LD is a predecessor to both OP and ST we can do this by checking:
-  //  a) if LD is a predecessor of a member of Xn or Yn.
-  //  b) if a Zn is a predecessor of ST.
-
-  // However, (b) can only occur through being a chain predecessor to
-  // ST, which is the same as Zn being a member or predecessor of Xn,
-  // which is a subset of LD being a predecessor of Xn. So it's
-  // subsumed by check (a).
-
+  // Check if the chain is produced by the load or is a TokenFactor with
+  // the load output chain as an operand. Return InputChain by reference.
   SDValue Chain = StoreNode->getChain();
 
-  // Gather X elements in ChainOps.
+  bool ChainCheck = false;
   if (Chain == Load.getValue(1)) {
-    FoundLoad = true;
-    ChainOps.push_back(Load.getOperand(0));
+    ChainCheck = true;
+    InputChain = LoadNode->getChain();
   } else if (Chain.getOpcode() == ISD::TokenFactor) {
+    SmallVector<SDValue, 4> ChainOps;
     for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
       SDValue Op = Chain.getOperand(i);
       if (Op == Load.getValue(1)) {
-        FoundLoad = true;
+        ChainCheck = true;
         // Drop Load, but keep its chain. No cycle check necessary.
        ChainOps.push_back(Load.getOperand(0));
         continue;
       }
-      LoopWorklist.push_back(Op.getNode());
-      ChainOps.push_back(Op);
-    }
-  }
-  if (!FoundLoad)
-    return false;
+      // Make sure using Op as part of the chain would not cause a cycle here.
+      // In theory, we could check whether the chain node is a predecessor of
+      // the load. But that can be very expensive. Instead visit the uses and
+      // make sure they all have a smaller node id than the load.
+      int LoadId = LoadNode->getNodeId();
+      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+             UE = UI->use_end(); UI != UE; ++UI) {
+        if (UI.getUse().getResNo() != 0)
+          continue;
+        if (UI->getNodeId() > LoadId)
+          return false;
+      }
 
-  // Worklist is currently Xn. Add Yn to worklist.
-  for (SDValue Op : StoredVal->ops())
-    if (Op.getNode() != LoadNode)
-      LoopWorklist.push_back(Op.getNode());
+      ChainOps.push_back(Op);
+    }
 
-  // Check (a) if Load is a predecessor to Xn + Yn
-  if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
-                                   true))
+    if (ChainCheck)
+      // Make a new TokenFactor with all the other input chains except
+      // for the load.
+      InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain),
+                                   MVT::Other, ChainOps);
+  }
+  if (!ChainCheck)
     return false;
 
-  InputChain =
-      CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
   return true;
-  }
+}
 
 // Change a chain of {load; op; store} of the same value into a simple op
 // through memory of that value, if the uses of the modified value and its
@@ -2417,8 +2383,6 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
   MemOp[1] = LoadNode->getMemOperand();
   Result->setMemRefs(MemOp, MemOp + 2);
 
-  // Update Load Chain uses as well.
- ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1)); ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1)); ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0)); CurDAG->RemoveDeadNode(Node); @@ -3130,7 +3094,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) { // Emit a testl or testw. SDNode *NewNode = CurDAG->getMachineNode(Op, dl, MVT::i32, Reg, Imm); // Replace CMP with TEST. - ReplaceNode(Node, NewNode); + CurDAG->ReplaceAllUsesWith(Node, NewNode); + CurDAG->RemoveDeadNode(Node); return; } break; diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll index 05af49ff959..e4c59631436 100644 --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -90,12 +90,12 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) nounwind { define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v32i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pavgb (%rdi), %xmm0 -; SSE2-NEXT: pavgb 16(%rdi), %xmm1 -; SSE2-NEXT: movdqu %xmm1, (%rax) +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa (%rsi), %xmm1 +; SSE2-NEXT: pavgb (%rdi), %xmm1 +; SSE2-NEXT: pavgb 16(%rsi), %xmm0 ; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v32i8: @@ -545,18 +545,18 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind { define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v64i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 +; SSE2-NEXT: movdqa 32(%rdi), %xmm0 +; SSE2-NEXT: movdqa (%rsi), %xmm1 +; SSE2-NEXT: movdqa 16(%rsi), %xmm2 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3 -; SSE2-NEXT: pavgb (%rdi), %xmm0 -; SSE2-NEXT: pavgb 16(%rdi), %xmm1 -; SSE2-NEXT: pavgb 32(%rdi), %xmm2 +; SSE2-NEXT: pavgb (%rdi), %xmm1 +; SSE2-NEXT: pavgb 16(%rdi), %xmm2 +; SSE2-NEXT: pavgb 32(%rsi), %xmm0 ; SSE2-NEXT: pavgb 48(%rdi), %xmm3 ; SSE2-NEXT: movdqu %xmm3, (%rax) +; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v64i8: @@ -582,23 +582,23 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind { ; ; AVX2-LABEL: avg_v64i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, (%rax) +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-NEXT: vpavgb (%rdi), %ymm1, %ymm1 +; AVX2-NEXT: vpavgb 32(%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqu %ymm1, (%rax) +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-NEXT: vpavgb (%rdi), %ymm1, %ymm1 +; AVX512F-NEXT: vpavgb 32(%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vmovdqu %ymm1, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -678,12 +678,12 @@ define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) nounwind { define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind { ; 
SSE2-LABEL: avg_v16i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pavgw (%rdi), %xmm0 -; SSE2-NEXT: pavgw 16(%rdi), %xmm1 -; SSE2-NEXT: movdqu %xmm1, (%rax) +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa (%rsi), %xmm1 +; SSE2-NEXT: pavgw (%rdi), %xmm1 +; SSE2-NEXT: pavgw 16(%rsi), %xmm0 ; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v16i16: @@ -729,18 +729,18 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind { define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind { ; SSE2-LABEL: avg_v32i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 +; SSE2-NEXT: movdqa 32(%rdi), %xmm0 +; SSE2-NEXT: movdqa (%rsi), %xmm1 +; SSE2-NEXT: movdqa 16(%rsi), %xmm2 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3 -; SSE2-NEXT: pavgw (%rdi), %xmm0 -; SSE2-NEXT: pavgw 16(%rdi), %xmm1 -; SSE2-NEXT: pavgw 32(%rdi), %xmm2 +; SSE2-NEXT: pavgw (%rdi), %xmm1 +; SSE2-NEXT: pavgw 16(%rdi), %xmm2 +; SSE2-NEXT: pavgw 32(%rsi), %xmm0 ; SSE2-NEXT: pavgw 48(%rdi), %xmm3 ; SSE2-NEXT: movdqu %xmm3, (%rax) +; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v32i16: @@ -766,23 +766,23 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind { ; ; AVX2-LABEL: avg_v32i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, (%rax) +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-NEXT: vpavgw (%rdi), %ymm1, %ymm1 +; AVX2-NEXT: vpavgw 32(%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqu %ymm1, (%rax) +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-NEXT: vpavgw (%rdi), %ymm1, %ymm1 +; AVX512F-NEXT: vpavgw 32(%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vmovdqu %ymm1, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -891,9 +891,9 @@ define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v32i8_2: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: pavgb (%rsi), %xmm0 -; SSE2-NEXT: pavgb 16(%rsi), %xmm1 +; SSE2-NEXT: pavgb 16(%rdi), %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq @@ -1072,9 +1072,9 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) nounwind { ; SSE2-LABEL: avg_v16i16_2: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: pavgw (%rsi), %xmm0 -; SSE2-NEXT: pavgw 16(%rsi), %xmm1 +; SSE2-NEXT: pavgw 16(%rdi), %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq @@ -1124,14 +1124,14 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa 
(%rdi), %xmm0 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1 -; SSE2-NEXT: movdqa 32(%rdi), %xmm2 -; SSE2-NEXT: movdqa 48(%rdi), %xmm3 +; SSE2-NEXT: movdqa 48(%rdi), %xmm2 +; SSE2-NEXT: movdqa 32(%rsi), %xmm3 ; SSE2-NEXT: pavgw (%rsi), %xmm0 ; SSE2-NEXT: pavgw 16(%rsi), %xmm1 -; SSE2-NEXT: pavgw 32(%rsi), %xmm2 -; SSE2-NEXT: pavgw 48(%rsi), %xmm3 -; SSE2-NEXT: movdqu %xmm3, (%rax) +; SSE2-NEXT: pavgw 32(%rdi), %xmm3 +; SSE2-NEXT: pavgw 48(%rsi), %xmm2 ; SSE2-NEXT: movdqu %xmm2, (%rax) +; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq @@ -1160,9 +1160,9 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind { ; AVX2-LABEL: avg_v32i16_2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -1171,9 +1171,9 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind { ; AVX512F-LABEL: avg_v32i16_2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax) ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) ; AVX512F-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/avx-vbroadcastf128.ll b/llvm/test/CodeGen/X86/avx-vbroadcastf128.ll index b5026437153..7fdbf31a993 100644 --- a/llvm/test/CodeGen/X86/avx-vbroadcastf128.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcastf128.ll @@ -235,16 +235,18 @@ define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) { ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: vmovaps (%ecx), %xmm0 ; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X32-NEXT: vmovaps %ymm1, (%eax) +; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: PR29088: ; X64: # %bb.0: +; X64-NEXT: vmovaps (%rdi), %xmm0 ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-NEXT: vmovaps %ymm1, (%rsi) +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-NEXT: retq %ld = load <4 x i32>, <4 x i32>* %p0 store <8 x float> zeroinitializer, <8 x float>* %p1 diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll index 3ae6c0b9d81..528dfcd6f8d 100644 --- a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll @@ -1065,7 +1065,9 @@ define void @isel_crash_16b(i8* %cV_R.addr) { ; X64: ## %bb.0: ## %eintry ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: vpbroadcastb (%rdi), %xmm1 +; X64-NEXT: movb (%rdi), %al +; X64-NEXT: vmovd %eax, %xmm1 +; X64-NEXT: vpbroadcastb %xmm1, %xmm1 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) ; X64-NEXT: retq @@ -1116,7 +1118,9 @@ define void @isel_crash_32b(i8* %cV_R.addr) { ; X64-NEXT: subq $128, %rsp ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-NEXT: vmovaps %ymm0, (%rsp) -; X64-NEXT: vpbroadcastb (%rdi), %ymm1 +; X64-NEXT: movb (%rdi), %al +; X64-NEXT: vmovd %eax, %xmm1 +; X64-NEXT: vpbroadcastb %xmm1, %ymm1 ; 
X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) ; X64-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) ; X64-NEXT: movq %rbp, %rsp @@ -1156,7 +1160,9 @@ define void @isel_crash_8w(i16* %cV_R.addr) { ; X64: ## %bb.0: ## %entry ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: vpbroadcastw (%rdi), %xmm1 +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: vmovd %eax, %xmm1 +; X64-NEXT: vpbroadcastw %xmm1, %xmm1 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) ; X64-NEXT: retq @@ -1207,7 +1213,9 @@ define void @isel_crash_16w(i16* %cV_R.addr) { ; X64-NEXT: subq $128, %rsp ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-NEXT: vmovaps %ymm0, (%rsp) -; X64-NEXT: vpbroadcastw (%rdi), %ymm1 +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: vmovd %eax, %xmm1 +; X64-NEXT: vpbroadcastw %xmm1, %ymm1 ; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) ; X64-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) ; X64-NEXT: movq %rbp, %rsp @@ -1243,14 +1251,26 @@ define void @isel_crash_4d(i32* %cV_R.addr) { ; X32-NEXT: addl $60, %esp ; X32-NEXT: retl ; -; X64-LABEL: isel_crash_4d: -; X64: ## %bb.0: ## %entry -; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: vbroadcastss (%rdi), %xmm1 -; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: retq +; X64-AVX2-LABEL: isel_crash_4d: +; X64-AVX2: ## %bb.0: ## %entry +; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX2-NEXT: movl (%rdi), %eax +; X64-AVX2-NEXT: vmovd %eax, %xmm1 +; X64-AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-AVX2-NEXT: retq +; +; X64-AVX512VL-LABEL: isel_crash_4d: +; X64-AVX512VL: ## %bb.0: ## %entry +; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX512VL-NEXT: movl (%rdi), %eax +; X64-AVX512VL-NEXT: vpbroadcastd %eax, %xmm1 +; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX512VL-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-AVX512VL-NEXT: retq entry: %__a.addr.i = alloca <2 x i64>, align 16 %__b.addr.i = alloca <2 x i64>, align 16 @@ -1287,24 +1307,46 @@ define void @isel_crash_8d(i32* %cV_R.addr) { ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; -; X64-LABEL: isel_crash_8d: -; X64: ## %bb.0: ## %eintry -; X64-NEXT: pushq %rbp -; X64-NEXT: .cfi_def_cfa_offset 16 -; X64-NEXT: .cfi_offset %rbp, -16 -; X64-NEXT: movq %rsp, %rbp -; X64-NEXT: .cfi_def_cfa_register %rbp -; X64-NEXT: andq $-32, %rsp -; X64-NEXT: subq $128, %rsp -; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: vmovaps %ymm0, (%rsp) -; X64-NEXT: vbroadcastss (%rdi), %ymm1 -; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; X64-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) -; X64-NEXT: movq %rbp, %rsp -; X64-NEXT: popq %rbp -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; X64-AVX2-LABEL: isel_crash_8d: +; X64-AVX2: ## %bb.0: ## %eintry +; X64-AVX2-NEXT: pushq %rbp +; X64-AVX2-NEXT: .cfi_def_cfa_offset 16 +; X64-AVX2-NEXT: .cfi_offset %rbp, -16 +; X64-AVX2-NEXT: movq %rsp, %rbp +; X64-AVX2-NEXT: .cfi_def_cfa_register %rbp +; X64-AVX2-NEXT: andq $-32, %rsp +; X64-AVX2-NEXT: subq $128, %rsp +; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovaps %ymm0, (%rsp) +; X64-AVX2-NEXT: movl (%rdi), %eax +; X64-AVX2-NEXT: vmovd %eax, %xmm1 +; X64-AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 +; X64-AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) +; 
X64-AVX2-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) +; X64-AVX2-NEXT: movq %rbp, %rsp +; X64-AVX2-NEXT: popq %rbp +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512VL-LABEL: isel_crash_8d: +; X64-AVX512VL: ## %bb.0: ## %eintry +; X64-AVX512VL-NEXT: pushq %rbp +; X64-AVX512VL-NEXT: .cfi_def_cfa_offset 16 +; X64-AVX512VL-NEXT: .cfi_offset %rbp, -16 +; X64-AVX512VL-NEXT: movq %rsp, %rbp +; X64-AVX512VL-NEXT: .cfi_def_cfa_register %rbp +; X64-AVX512VL-NEXT: andq $-32, %rsp +; X64-AVX512VL-NEXT: subq $128, %rsp +; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-AVX512VL-NEXT: vmovaps %ymm0, (%rsp) +; X64-AVX512VL-NEXT: movl (%rdi), %eax +; X64-AVX512VL-NEXT: vpbroadcastd %eax, %ymm1 +; X64-AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) +; X64-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) +; X64-AVX512VL-NEXT: movq %rbp, %rsp +; X64-AVX512VL-NEXT: popq %rbp +; X64-AVX512VL-NEXT: vzeroupper +; X64-AVX512VL-NEXT: retq eintry: %__a.addr.i = alloca <4 x i64>, align 16 %__b.addr.i = alloca <4 x i64>, align 16 @@ -1328,20 +1370,33 @@ define void @isel_crash_2q(i64* %cV_R.addr) { ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X32-NEXT: vmovaps %xmm0, (%esp) -; X32-NEXT: vpbroadcastq (%eax), %xmm1 +; X32-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X32-NEXT: vpbroadcastq %xmm1, %xmm1 ; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) ; X32-NEXT: addl $60, %esp ; X32-NEXT: retl ; -; X64-LABEL: isel_crash_2q: -; X64: ## %bb.0: ## %entry -; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: vpbroadcastq (%rdi), %xmm1 -; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: retq +; X64-AVX2-LABEL: isel_crash_2q: +; X64-AVX2: ## %bb.0: ## %entry +; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX2-NEXT: movq (%rdi), %rax +; X64-AVX2-NEXT: vmovq %rax, %xmm1 +; X64-AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-AVX2-NEXT: retq +; +; X64-AVX512VL-LABEL: isel_crash_2q: +; X64-AVX512VL: ## %bb.0: ## %entry +; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX512VL-NEXT: movq (%rdi), %rax +; X64-AVX512VL-NEXT: vpbroadcastq %rax, %xmm1 +; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX512VL-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-AVX512VL-NEXT: retq entry: %__a.addr.i = alloca <2 x i64>, align 16 %__b.addr.i = alloca <2 x i64>, align 16 @@ -1378,24 +1433,46 @@ define void @isel_crash_4q(i64* %cV_R.addr) { ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; -; X64-LABEL: isel_crash_4q: -; X64: ## %bb.0: ## %eintry -; X64-NEXT: pushq %rbp -; X64-NEXT: .cfi_def_cfa_offset 16 -; X64-NEXT: .cfi_offset %rbp, -16 -; X64-NEXT: movq %rsp, %rbp -; X64-NEXT: .cfi_def_cfa_register %rbp -; X64-NEXT: andq $-32, %rsp -; X64-NEXT: subq $128, %rsp -; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: vmovaps %ymm0, (%rsp) -; X64-NEXT: vbroadcastsd (%rdi), %ymm1 -; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; X64-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) -; X64-NEXT: movq %rbp, %rsp -; X64-NEXT: popq %rbp -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; X64-AVX2-LABEL: isel_crash_4q: +; X64-AVX2: ## %bb.0: ## %eintry +; X64-AVX2-NEXT: pushq %rbp +; X64-AVX2-NEXT: .cfi_def_cfa_offset 16 +; X64-AVX2-NEXT: .cfi_offset %rbp, -16 +; X64-AVX2-NEXT: 
movq %rsp, %rbp +; X64-AVX2-NEXT: .cfi_def_cfa_register %rbp +; X64-AVX2-NEXT: andq $-32, %rsp +; X64-AVX2-NEXT: subq $128, %rsp +; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovaps %ymm0, (%rsp) +; X64-AVX2-NEXT: movq (%rdi), %rax +; X64-AVX2-NEXT: vmovq %rax, %xmm1 +; X64-AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 +; X64-AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) +; X64-AVX2-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) +; X64-AVX2-NEXT: movq %rbp, %rsp +; X64-AVX2-NEXT: popq %rbp +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512VL-LABEL: isel_crash_4q: +; X64-AVX512VL: ## %bb.0: ## %eintry +; X64-AVX512VL-NEXT: pushq %rbp +; X64-AVX512VL-NEXT: .cfi_def_cfa_offset 16 +; X64-AVX512VL-NEXT: .cfi_offset %rbp, -16 +; X64-AVX512VL-NEXT: movq %rsp, %rbp +; X64-AVX512VL-NEXT: .cfi_def_cfa_register %rbp +; X64-AVX512VL-NEXT: andq $-32, %rsp +; X64-AVX512VL-NEXT: subq $128, %rsp +; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-AVX512VL-NEXT: vmovaps %ymm0, (%rsp) +; X64-AVX512VL-NEXT: movq (%rdi), %rax +; X64-AVX512VL-NEXT: vpbroadcastq %rax, %ymm1 +; X64-AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) +; X64-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) +; X64-AVX512VL-NEXT: movq %rbp, %rsp +; X64-AVX512VL-NEXT: popq %rbp +; X64-AVX512VL-NEXT: vzeroupper +; X64-AVX512VL-NEXT: retq eintry: %__a.addr.i = alloca <4 x i64>, align 16 %__b.addr.i = alloca <4 x i64>, align 16 diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll b/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll index 996e6796616..254cdfdd8cb 100644 --- a/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll +++ b/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll @@ -271,16 +271,18 @@ define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) { ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: vmovaps (%ecx), %xmm0 ; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X32-NEXT: vmovaps %ymm1, (%eax) +; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: PR29088: ; X64: # %bb.0: +; X64-NEXT: vmovaps (%rdi), %xmm0 ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-NEXT: vmovaps %ymm1, (%rsi) +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-NEXT: retq %ld = load <4 x i32>, <4 x i32>* %p0 store <8 x float> zeroinitializer, <8 x float>* %p1 diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll b/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll index 2bf69cfadcf..c5ecb1559b4 100644 --- a/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll +++ b/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll @@ -186,23 +186,26 @@ define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind { define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) { ; X64-AVX512VL-LABEL: PR29088: ; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-AVX512VL-NEXT: vmovdqa %ymm1, (%rsi) +; X64-AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512VL-NEXT: retq ; ; X64-AVX512BWVL-LABEL: PR29088: ; X64-AVX512BWVL: ## %bb.0: +; X64-AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-AVX512BWVL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi) +; X64-AVX512BWVL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512BWVL-NEXT: retq ; ; 
X64-AVX512DQVL-LABEL: PR29088: ; X64-AVX512DQVL: ## %bb.0: +; X64-AVX512DQVL-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX512DQVL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-AVX512DQVL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-AVX512DQVL-NEXT: vmovaps %ymm1, (%rsi) +; X64-AVX512DQVL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512DQVL-NEXT: retq %ld = load <4 x i32>, <4 x i32>* %p0 store <8 x float> zeroinitializer, <8 x float>* %p1 diff --git a/llvm/test/CodeGen/X86/i256-add.ll b/llvm/test/CodeGen/X86/i256-add.ll index 85a885a4315..36d838a68cb 100644 --- a/llvm/test/CodeGen/X86/i256-add.ll +++ b/llvm/test/CodeGen/X86/i256-add.ll @@ -9,30 +9,40 @@ define void @add(i256* %p, i256* %q) nounwind { ; X32-NEXT: pushl %ebx ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi -; X32-NEXT: subl $8, %esp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 28(%eax), %ecx -; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: movl 24(%eax), %ecx -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X32-NEXT: movl 20(%eax), %esi -; X32-NEXT: movl 16(%eax), %edi -; X32-NEXT: movl 12(%eax), %ebx -; X32-NEXT: movl 8(%eax), %ebp -; X32-NEXT: movl (%eax), %ecx -; X32-NEXT: movl 4(%eax), %edx +; X32-NEXT: subl $12, %esp ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: addl %ecx, (%eax) -; X32-NEXT: adcl %edx, 4(%eax) -; X32-NEXT: adcl %ebp, 8(%eax) -; X32-NEXT: adcl %ebx, 12(%eax) -; X32-NEXT: adcl %edi, 16(%eax) -; X32-NEXT: adcl %esi, 20(%eax) -; X32-NEXT: movl (%esp), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, 24(%eax) -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, 28(%eax) -; X32-NEXT: addl $8, %esp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 8(%ecx), %edi +; X32-NEXT: movl (%ecx), %edx +; X32-NEXT: movl 4(%ecx), %ebx +; X32-NEXT: movl 28(%eax), %esi +; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill +; X32-NEXT: movl 24(%eax), %ebp +; X32-NEXT: addl (%eax), %edx +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill +; X32-NEXT: adcl 4(%eax), %ebx +; X32-NEXT: adcl 8(%eax), %edi +; X32-NEXT: movl %edi, (%esp) # 4-byte Spill +; X32-NEXT: movl 20(%eax), %edi +; X32-NEXT: movl 12(%eax), %edx +; X32-NEXT: movl 16(%eax), %esi +; X32-NEXT: adcl 12(%ecx), %edx +; X32-NEXT: adcl 16(%ecx), %esi +; X32-NEXT: adcl 20(%ecx), %edi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: adcl 24(%ecx), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload +; X32-NEXT: adcl %ebp, 28(%ecx) +; X32-NEXT: movl (%esp), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, 8(%ecx) +; X32-NEXT: movl %ebx, 4(%ecx) +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, (%ecx) +; X32-NEXT: movl %edx, 12(%ecx) +; X32-NEXT: movl %esi, 16(%ecx) +; X32-NEXT: movl %edi, 20(%ecx) +; X32-NEXT: movl %eax, 24(%ecx) +; X32-NEXT: addl $12, %esp ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx @@ -41,14 +51,17 @@ define void @add(i256* %p, i256* %q) nounwind { ; ; X64-LABEL: add: ; X64: # %bb.0: -; X64-NEXT: movq 24(%rsi), %rax -; X64-NEXT: movq 16(%rsi), %rcx -; X64-NEXT: movq (%rsi), %rdx -; X64-NEXT: movq 8(%rsi), %rsi -; X64-NEXT: addq %rdx, (%rdi) -; X64-NEXT: adcq %rsi, 8(%rdi) -; X64-NEXT: adcq %rcx, 16(%rdi) -; X64-NEXT: adcq %rax, 24(%rdi) +; X64-NEXT: movq 16(%rdi), %rax +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq 8(%rdi), %rdx +; X64-NEXT: movq 24(%rsi), %r8 +; X64-NEXT: addq (%rsi), %rcx +; X64-NEXT: adcq 8(%rsi), %rdx +; X64-NEXT: adcq 16(%rsi), %rax +; X64-NEXT: adcq %r8, 24(%rdi) +; X64-NEXT: movq %rax, 16(%rdi) +; 
X64-NEXT: movq %rdx, 8(%rdi) +; X64-NEXT: movq %rcx, (%rdi) ; X64-NEXT: retq %a = load i256, i256* %p %b = load i256, i256* %q @@ -64,28 +77,35 @@ define void @sub(i256* %p, i256* %q) nounwind { ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi ; X32-NEXT: subl $8, %esp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 28(%eax), %ecx -; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: movl 24(%eax), %ecx -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X32-NEXT: movl 20(%eax), %esi -; X32-NEXT: movl 16(%eax), %edi -; X32-NEXT: movl 12(%eax), %ebx -; X32-NEXT: movl 8(%eax), %ebp -; X32-NEXT: movl (%eax), %ecx -; X32-NEXT: movl 4(%eax), %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: subl %ecx, (%eax) -; X32-NEXT: sbbl %edx, 4(%eax) -; X32-NEXT: sbbl %ebp, 8(%eax) -; X32-NEXT: sbbl %ebx, 12(%eax) -; X32-NEXT: sbbl %edi, 16(%eax) -; X32-NEXT: sbbl %esi, 20(%eax) -; X32-NEXT: movl (%esp), %ecx # 4-byte Reload -; X32-NEXT: sbbl %ecx, 24(%eax) -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload -; X32-NEXT: sbbl %ecx, 28(%eax) +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 16(%ecx), %eax +; X32-NEXT: movl 12(%ecx), %edx +; X32-NEXT: movl 8(%ecx), %edi +; X32-NEXT: movl (%ecx), %ebx +; X32-NEXT: movl 4(%ecx), %ebp +; X32-NEXT: subl (%esi), %ebx +; X32-NEXT: sbbl 4(%esi), %ebp +; X32-NEXT: sbbl 8(%esi), %edi +; X32-NEXT: sbbl 12(%esi), %edx +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill +; X32-NEXT: sbbl 16(%esi), %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl 20(%ecx), %edx +; X32-NEXT: sbbl 20(%esi), %edx +; X32-NEXT: movl 24(%ecx), %eax +; X32-NEXT: sbbl 24(%esi), %eax +; X32-NEXT: movl 28(%esi), %esi +; X32-NEXT: sbbl %esi, 28(%ecx) +; X32-NEXT: movl %edi, 8(%ecx) +; X32-NEXT: movl %ebp, 4(%ecx) +; X32-NEXT: movl %ebx, (%ecx) +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload +; X32-NEXT: movl %esi, 12(%ecx) +; X32-NEXT: movl (%esp), %esi # 4-byte Reload +; X32-NEXT: movl %esi, 16(%ecx) +; X32-NEXT: movl %edx, 20(%ecx) +; X32-NEXT: movl %eax, 24(%ecx) ; X32-NEXT: addl $8, %esp ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi @@ -95,14 +115,17 @@ define void @sub(i256* %p, i256* %q) nounwind { ; ; X64-LABEL: sub: ; X64: # %bb.0: -; X64-NEXT: movq 24(%rsi), %rax -; X64-NEXT: movq 16(%rsi), %rcx -; X64-NEXT: movq (%rsi), %rdx -; X64-NEXT: movq 8(%rsi), %rsi -; X64-NEXT: subq %rdx, (%rdi) -; X64-NEXT: sbbq %rsi, 8(%rdi) -; X64-NEXT: sbbq %rcx, 16(%rdi) -; X64-NEXT: sbbq %rax, 24(%rdi) +; X64-NEXT: movq 16(%rdi), %rax +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq 8(%rdi), %rdx +; X64-NEXT: movq 24(%rsi), %r8 +; X64-NEXT: subq (%rsi), %rcx +; X64-NEXT: sbbq 8(%rsi), %rdx +; X64-NEXT: sbbq 16(%rsi), %rax +; X64-NEXT: sbbq %r8, 24(%rdi) +; X64-NEXT: movq %rax, 16(%rdi) +; X64-NEXT: movq %rdx, 8(%rdi) +; X64-NEXT: movq %rcx, (%rdi) ; X64-NEXT: retq %a = load i256, i256* %p %b = load i256, i256* %q diff --git a/llvm/test/CodeGen/X86/masked_memop.ll b/llvm/test/CodeGen/X86/masked_memop.ll index aa6ae096445..4a250205051 100644 --- a/llvm/test/CodeGen/X86/masked_memop.ll +++ b/llvm/test/CodeGen/X86/masked_memop.ll @@ -1264,7 +1264,8 @@ define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %v ; AVX-LABEL: load_one_mask_bit_set5: ; AVX: ## %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX-NEXT: 
vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/merge-consecutive-stores.ll b/llvm/test/CodeGen/X86/merge-consecutive-stores.ll index 4f511ef99e5..af5fb478e52 100644 --- a/llvm/test/CodeGen/X86/merge-consecutive-stores.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-stores.ll @@ -10,11 +10,12 @@ define i32 @foo (i64* %so) nounwind uwtable ssp { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl $0, 28(%eax) ; CHECK-NEXT: movl $0, 24(%eax) -; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: cmpl 16(%eax), %ecx -; CHECK-NEXT: movl $0, 16(%eax) -; CHECK-NEXT: sbbl 20(%eax), %ecx +; CHECK-NEXT: movl 20(%eax), %ecx ; CHECK-NEXT: movl $0, 20(%eax) +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: cmpl 16(%eax), %edx +; CHECK-NEXT: movl $0, 16(%eax) +; CHECK-NEXT: sbbl %ecx, %edx ; CHECK-NEXT: setl %al ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: negl %eax diff --git a/llvm/test/CodeGen/X86/nontemporal.ll b/llvm/test/CodeGen/X86/nontemporal.ll index 472c3e4774c..f53982a8542 100644 --- a/llvm/test/CodeGen/X86/nontemporal.ll +++ b/llvm/test/CodeGen/X86/nontemporal.ll @@ -13,35 +13,36 @@ define i32 @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4 ; X32-SSE-NEXT: andl $-16, %esp ; X32-SSE-NEXT: subl $16, %esp ; X32-SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero -; X32-SSE-NEXT: movl 12(%ebp), %ecx +; X32-SSE-NEXT: movl 12(%ebp), %eax ; X32-SSE-NEXT: movdqa 56(%ebp), %xmm4 ; X32-SSE-NEXT: movdqa 40(%ebp), %xmm5 ; X32-SSE-NEXT: movdqa 24(%ebp), %xmm6 -; X32-SSE-NEXT: movl 8(%ebp), %esi -; X32-SSE-NEXT: movl 80(%ebp), %edx -; X32-SSE-NEXT: movl (%edx), %eax +; X32-SSE-NEXT: movl 8(%ebp), %edx +; X32-SSE-NEXT: movl 80(%ebp), %ecx +; X32-SSE-NEXT: movl (%ecx), %esi ; X32-SSE-NEXT: addps {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: movntps %xmm0, (%esi) +; X32-SSE-NEXT: movntps %xmm0, (%edx) ; X32-SSE-NEXT: paddq {{\.LCPI.*}}, %xmm2 -; X32-SSE-NEXT: addl (%edx), %eax -; X32-SSE-NEXT: movntdq %xmm2, (%esi) +; X32-SSE-NEXT: addl (%ecx), %esi +; X32-SSE-NEXT: movntdq %xmm2, (%edx) ; X32-SSE-NEXT: addpd {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: addl (%edx), %eax -; X32-SSE-NEXT: movntpd %xmm1, (%esi) +; X32-SSE-NEXT: addl (%ecx), %esi +; X32-SSE-NEXT: movntpd %xmm1, (%edx) ; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm6 -; X32-SSE-NEXT: addl (%edx), %eax -; X32-SSE-NEXT: movntdq %xmm6, (%esi) +; X32-SSE-NEXT: addl (%ecx), %esi +; X32-SSE-NEXT: movntdq %xmm6, (%edx) ; X32-SSE-NEXT: paddw {{\.LCPI.*}}, %xmm5 -; X32-SSE-NEXT: addl (%edx), %eax -; X32-SSE-NEXT: movntdq %xmm5, (%esi) +; X32-SSE-NEXT: addl (%ecx), %esi +; X32-SSE-NEXT: movntdq %xmm5, (%edx) ; X32-SSE-NEXT: paddb {{\.LCPI.*}}, %xmm4 -; X32-SSE-NEXT: addl (%edx), %eax -; X32-SSE-NEXT: movntdq %xmm4, (%esi) -; X32-SSE-NEXT: addl (%edx), %eax -; X32-SSE-NEXT: movntil %ecx, (%esi) -; X32-SSE-NEXT: addl (%edx), %eax -; X32-SSE-NEXT: movsd %xmm3, (%esi) -; X32-SSE-NEXT: addl (%edx), %eax +; X32-SSE-NEXT: addl (%ecx), %esi +; X32-SSE-NEXT: movntdq %xmm4, (%edx) +; X32-SSE-NEXT: addl (%ecx), %esi +; X32-SSE-NEXT: movntil %eax, (%edx) +; X32-SSE-NEXT: movl (%ecx), %eax +; X32-SSE-NEXT: addl %esi, %eax +; X32-SSE-NEXT: movsd %xmm3, (%edx) +; X32-SSE-NEXT: addl (%ecx), %eax ; X32-SSE-NEXT: leal -4(%ebp), %esp ; X32-SSE-NEXT: popl %esi ; X32-SSE-NEXT: popl %ebp @@ -55,35 +56,36 @@ define i32 @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4 ; X32-AVX-NEXT: andl $-16, %esp ; X32-AVX-NEXT: subl $16, %esp ; X32-AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; X32-AVX-NEXT: movl 12(%ebp), %ecx +; 
X32-AVX-NEXT: movl 12(%ebp), %eax ; X32-AVX-NEXT: vmovdqa 56(%ebp), %xmm4 ; X32-AVX-NEXT: vmovdqa 40(%ebp), %xmm5 ; X32-AVX-NEXT: vmovdqa 24(%ebp), %xmm6 -; X32-AVX-NEXT: movl 8(%ebp), %edx -; X32-AVX-NEXT: movl 80(%ebp), %esi -; X32-AVX-NEXT: movl (%esi), %eax +; X32-AVX-NEXT: movl 8(%ebp), %ecx +; X32-AVX-NEXT: movl 80(%ebp), %edx +; X32-AVX-NEXT: movl (%edx), %esi ; X32-AVX-NEXT: vaddps {{\.LCPI.*}}, %xmm0, %xmm0 -; X32-AVX-NEXT: vmovntps %xmm0, (%edx) +; X32-AVX-NEXT: vmovntps %xmm0, (%ecx) ; X32-AVX-NEXT: vpaddq {{\.LCPI.*}}, %xmm2, %xmm0 -; X32-AVX-NEXT: addl (%esi), %eax -; X32-AVX-NEXT: vmovntdq %xmm0, (%edx) +; X32-AVX-NEXT: addl (%edx), %esi +; X32-AVX-NEXT: vmovntdq %xmm0, (%ecx) ; X32-AVX-NEXT: vaddpd {{\.LCPI.*}}, %xmm1, %xmm0 -; X32-AVX-NEXT: addl (%esi), %eax -; X32-AVX-NEXT: vmovntpd %xmm0, (%edx) +; X32-AVX-NEXT: addl (%edx), %esi +; X32-AVX-NEXT: vmovntpd %xmm0, (%ecx) ; X32-AVX-NEXT: vpaddd {{\.LCPI.*}}, %xmm6, %xmm0 -; X32-AVX-NEXT: addl (%esi), %eax -; X32-AVX-NEXT: vmovntdq %xmm0, (%edx) +; X32-AVX-NEXT: addl (%edx), %esi +; X32-AVX-NEXT: vmovntdq %xmm0, (%ecx) ; X32-AVX-NEXT: vpaddw {{\.LCPI.*}}, %xmm5, %xmm0 -; X32-AVX-NEXT: addl (%esi), %eax -; X32-AVX-NEXT: vmovntdq %xmm0, (%edx) +; X32-AVX-NEXT: addl (%edx), %esi +; X32-AVX-NEXT: vmovntdq %xmm0, (%ecx) ; X32-AVX-NEXT: vpaddb {{\.LCPI.*}}, %xmm4, %xmm0 -; X32-AVX-NEXT: addl (%esi), %eax -; X32-AVX-NEXT: vmovntdq %xmm0, (%edx) -; X32-AVX-NEXT: addl (%esi), %eax -; X32-AVX-NEXT: movntil %ecx, (%edx) -; X32-AVX-NEXT: addl (%esi), %eax -; X32-AVX-NEXT: vmovsd %xmm3, (%edx) -; X32-AVX-NEXT: addl (%esi), %eax +; X32-AVX-NEXT: addl (%edx), %esi +; X32-AVX-NEXT: vmovntdq %xmm0, (%ecx) +; X32-AVX-NEXT: addl (%edx), %esi +; X32-AVX-NEXT: movntil %eax, (%ecx) +; X32-AVX-NEXT: movl (%edx), %eax +; X32-AVX-NEXT: addl %esi, %eax +; X32-AVX-NEXT: vmovsd %xmm3, (%ecx) +; X32-AVX-NEXT: addl (%edx), %eax ; X32-AVX-NEXT: leal -4(%ebp), %esp ; X32-AVX-NEXT: popl %esi ; X32-AVX-NEXT: popl %ebp diff --git a/llvm/test/CodeGen/X86/pr36274.ll b/llvm/test/CodeGen/X86/pr36274.ll deleted file mode 100644 index 97b958c6b68..00000000000 --- a/llvm/test/CodeGen/X86/pr36274.ll +++ /dev/null @@ -1,33 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s - -; This test is checking for a case where the x86 load-op-store fusion -; misses a dependence between the fused load and a non-fused operand -; to the load, causing a cycle. Here the dependence in question comes -; from the carry-in input of the adcl.
- -@vx = external local_unnamed_addr global <2 x i32>, align 8 - -define void @pr36274(i32* %somewhere) { -; CHECK-LABEL: pr36274: -; CHECK: # %bb.0: -; CHECK-NEXT: movl vx+4, %eax -; CHECK-NEXT: addl $1, vx -; CHECK-NEXT: adcl $0, %eax -; CHECK-NEXT: movl %eax, vx+4 -; CHECK-NEXT: retl - %a0 = getelementptr <2 x i32>, <2 x i32>* @vx, i32 0, i32 0 - %a1 = getelementptr <2 x i32>, <2 x i32>* @vx, i32 0, i32 1 - %x1 = load volatile i32, i32* %a1, align 4 - %x0 = load volatile i32, i32* %a0, align 8 - %vx0 = insertelement <2 x i32> undef, i32 %x0, i32 0 - %vx1 = insertelement <2 x i32> %vx0, i32 %x1, i32 1 - %x = bitcast <2 x i32> %vx1 to i64 - %add = add i64 %x, 1 - %vadd = bitcast i64 %add to <2 x i32> - %vx1_0 = extractelement <2 x i32> %vadd, i32 0 - %vx1_1 = extractelement <2 x i32> %vadd, i32 1 - store i32 %vx1_0, i32* %a0, align 8 - store i32 %vx1_1, i32* %a1, align 4 - ret void -} diff --git a/llvm/test/CodeGen/X86/pr36312.ll b/llvm/test/CodeGen/X86/pr36312.ll deleted file mode 100644 index 64048511ac7..00000000000 --- a/llvm/test/CodeGen/X86/pr36312.ll +++ /dev/null @@ -1,35 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s - -%struct.anon = type { i32, i32 } - -@c = common global %struct.anon zeroinitializer, align 4 -@d = local_unnamed_addr global %struct.anon* @c, align 8 -@a = common local_unnamed_addr global i32 0, align 4 -@b = common local_unnamed_addr global i32 0, align 4 - -; Function Attrs: norecurse nounwind uwtable -define void @g() local_unnamed_addr #0 { -; CHECK-LABEL: g: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: movl 4(%rax), %eax -; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: incl {{.*}}(%rip) -; CHECK-NEXT: setne %cl -; CHECK-NEXT: addl %eax, %ecx -; CHECK-NEXT: movl %ecx, {{.*}}(%rip) -; CHECK-NEXT: retq -entry: - %0 = load %struct.anon*, %struct.anon** @d, align 8 - %y = getelementptr inbounds %struct.anon, %struct.anon* %0, i64 0, i32 1 - %1 = load i32, i32* %y, align 4 - %2 = load i32, i32* @b, align 4 - %inc = add nsw i32 %2, 1 - store i32 %inc, i32* @b, align 4 - %tobool = icmp ne i32 %inc, 0 - %land.ext = zext i1 %tobool to i32 - %add = add nsw i32 %1, %land.ext - store i32 %add, i32* @a, align 4 - ret void -} diff --git a/llvm/test/CodeGen/X86/required-vector-width.ll b/llvm/test/CodeGen/X86/required-vector-width.ll index dcca540b31d..257d3f0d079 100644 --- a/llvm/test/CodeGen/X86/required-vector-width.ll +++ b/llvm/test/CodeGen/X86/required-vector-width.ll @@ -39,12 +39,12 @@ define void @add512(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "required-ve define void @avg_v64i8_256(<64 x i8>* %a, <64 x i8>* %b) "required-vector-width"="256" { ; CHECK-LABEL: avg_v64i8_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rsi), %ymm0 -; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1 -; CHECK-NEXT: vpavgb (%rdi), %ymm0, %ymm0 -; CHECK-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1 -; CHECK-NEXT: vmovdqu %ymm1, (%rax) +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm0 +; CHECK-NEXT: vmovdqa (%rsi), %ymm1 +; CHECK-NEXT: vpavgb (%rdi), %ymm1, %ymm1 +; CHECK-NEXT: vpavgb 32(%rsi), %ymm0, %ymm0 ; CHECK-NEXT: vmovdqu %ymm0, (%rax) +; CHECK-NEXT: vmovdqu %ymm1, (%rax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = load <64 x i8>, <64 x i8>* %a diff --git a/llvm/test/CodeGen/X86/store_op_load_fold2.ll b/llvm/test/CodeGen/X86/store_op_load_fold2.ll index 674b8d8f938..f47d87f4bb8 100644 --- a/llvm/test/CodeGen/X86/store_op_load_fold2.ll +++ 
b/llvm/test/CodeGen/X86/store_op_load_fold2.ll @@ -17,14 +17,14 @@ cond_true2732.preheader: ; preds = %entry store i64 %tmp2676.us.us, i64* %tmp2666 ret i32 0 -; INTEL: and {{e..}}, dword ptr [356] -; INTEL: and dword ptr [360], {{e..}} -; FIXME: mov dword ptr [356], {{e..}} +; INTEL: and {{e..}}, dword ptr [360] +; INTEL: and dword ptr [356], {{e..}} +; FIXME: mov dword ptr [360], {{e..}} ; The above line comes out as 'mov 360, eax', but when the register is ecx it works? -; ATT: andl 356, %{{e..}} -; ATT: andl %{{e..}}, 360 -; ATT: movl %{{e..}}, 356 +; ATT: andl 360, %{{e..}} +; ATT: andl %{{e..}}, 356 +; ATT: movl %{{e..}}, 360 } diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll index a4477b2375b..bcb7d14f953 100644 --- a/llvm/test/CodeGen/X86/subvector-broadcast.ll +++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll @@ -751,64 +751,72 @@ define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p ; X32-AVX: # %bb.0: ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX-NEXT: vmovaps (%ecx), %xmm0 ; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X32-AVX-NEXT: vmovaps %xmm1, (%eax) +; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX-NEXT: retl ; ; X32-AVX512F-LABEL: test_broadcast_4i32_8i32_chain: ; X32-AVX512F: # %bb.0: ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512F-NEXT: vmovaps (%ecx), %xmm0 ; X32-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X32-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X32-AVX512F-NEXT: vmovdqa %xmm1, (%eax) +; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512F-NEXT: retl ; ; X32-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain: ; X32-AVX512BW: # %bb.0: ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512BW-NEXT: vmovaps (%ecx), %xmm0 ; X32-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X32-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X32-AVX512BW-NEXT: vmovdqa %xmm1, (%eax) +; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512BW-NEXT: retl ; ; X32-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain: ; X32-AVX512DQ: # %bb.0: ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512DQ-NEXT: vmovaps (%ecx), %xmm0 ; X32-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X32-AVX512DQ-NEXT: vmovaps %xmm1, (%eax) +; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512DQ-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain: ; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-AVX-NEXT: vmovaps %xmm1, (%rsi) +; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX-NEXT: retq ; ; X64-AVX512F-LABEL: test_broadcast_4i32_8i32_chain: ; X64-AVX512F: # %bb.0: +; X64-AVX512F-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-AVX512F-NEXT: vmovdqa %xmm1, (%rsi) +; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain: ; X64-AVX512BW: # %bb.0: +; X64-AVX512BW-NEXT: vmovaps (%rdi), %xmm0 ; 
X64-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-AVX512BW-NEXT: vmovdqa %xmm1, (%rsi) +; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512BW-NEXT: retq ; ; X64-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain: ; X64-AVX512DQ: # %bb.0: +; X64-AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-AVX512DQ-NEXT: vmovaps %xmm1, (%rsi) +; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512DQ-NEXT: retq %1 = load <4 x i32>, <4 x i32>* %p0 store <4 x float> zeroinitializer, <4 x float>* %p1 @@ -821,9 +829,10 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>* ; X32-AVX: # %bb.0: ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX-NEXT: vmovaps (%ecx), %xmm0 ; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X32-AVX-NEXT: vmovaps %xmm1, (%eax) +; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX-NEXT: retl ; @@ -831,56 +840,63 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>* ; X32-AVX512F: # %bb.0: ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X32-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X32-AVX512F-NEXT: vmovdqa %xmm1, (%eax) +; X32-AVX512F-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X32-AVX512F-NEXT: retl ; ; X32-AVX512BW-LABEL: test_broadcast_4i32_16i32_chain: ; X32-AVX512BW: # %bb.0: ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X32-AVX512BW-NEXT: vmovdqa %xmm1, (%eax) +; X32-AVX512BW-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X32-AVX512BW-NEXT: retl ; ; X32-AVX512DQ-LABEL: test_broadcast_4i32_16i32_chain: ; X32-AVX512DQ: # %bb.0: ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X32-AVX512DQ-NEXT: vmovaps %xmm1, (%eax) +; X32-AVX512DQ-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X32-AVX512DQ-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain: ; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-AVX-NEXT: vmovaps %xmm1, (%rsi) +; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX-NEXT: retq ; ; X64-AVX512F-LABEL: test_broadcast_4i32_16i32_chain: ; X64-AVX512F: # %bb.0: +; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512F-NEXT: vmovdqa %xmm1, (%rsi) +; X64-AVX512F-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; 
X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: test_broadcast_4i32_16i32_chain: ; X64-AVX512BW: # %bb.0: +; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512BW-NEXT: vmovdqa %xmm1, (%rsi) +; X64-AVX512BW-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512BW-NEXT: retq ; ; X64-AVX512DQ-LABEL: test_broadcast_4i32_16i32_chain: ; X64-AVX512DQ: # %bb.0: +; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512DQ-NEXT: vmovaps %xmm1, (%rsi) +; X64-AVX512DQ-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512DQ-NEXT: retq %1 = load <4 x i32>, <4 x i32>* %p0 store <4 x float> zeroinitializer, <4 x float>* %p1 diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll index d74f925939f..6b63e11ca72 100644 --- a/llvm/test/CodeGen/X86/var-permute-256.ll +++ b/llvm/test/CodeGen/X86/var-permute-256.ll @@ -685,49 +685,64 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind { ; XOP-NEXT: vmovd %eax, %xmm0 ; XOP-NEXT: vpextrb $1, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $2, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $3, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $4, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $5, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $6, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $7, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $8, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $9, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $10, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $11, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $12, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: 
vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $13, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $14, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $15, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $0, %xmm1, %eax ; XOP-NEXT: andl $31, %eax ; XOP-NEXT: movzbl (%rsp,%rax), %eax @@ -797,49 +812,64 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind { ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpextrb $1, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $2, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $3, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $4, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $5, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $6, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $7, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $8, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $9, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $10, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $11, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $12, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $13, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb 
$13, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $14, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $15, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $0, %xmm1, %eax ; AVX1-NEXT: andl $31, %eax ; AVX1-NEXT: movzbl (%rsp,%rax), %eax @@ -909,49 +939,64 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind { ; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: vpextrb $1, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $2, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $3, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $4, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $5, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $6, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $7, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $8, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $9, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $10, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $11, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $12, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $13, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $14, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $15, %xmm2, %eax ; AVX2-NEXT: andl 
$31, %eax -; AVX2-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm1, %eax ; AVX2-NEXT: andl $31, %eax ; AVX2-NEXT: movzbl (%rsp,%rax), %eax @@ -1021,49 +1066,64 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind { ; AVX512F-NEXT: vmovd %eax, %xmm0 ; AVX512F-NEXT: vpextrb $1, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $2, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $3, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $4, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $5, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $6, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $7, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $8, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $9, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $10, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $11, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $12, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $13, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $14, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $15, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, 
%xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $0, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax @@ -1133,49 +1193,64 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind { ; AVX512DQ-NEXT: vmovd %eax, %xmm0 ; AVX512DQ-NEXT: vpextrb $1, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $2, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $3, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $4, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $5, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $6, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $7, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $8, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $9, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $10, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $11, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $12, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $13, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $14, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $15, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: 
vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $0, %xmm1, %eax ; AVX512DQ-NEXT: andl $31, %eax ; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax @@ -1245,49 +1320,64 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind { ; AVX512VL-NEXT: vmovd %eax, %xmm0 ; AVX512VL-NEXT: vpextrb $1, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $2, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $3, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $4, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $5, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $6, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $7, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $8, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $9, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $10, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $11, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $12, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $13, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $14, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $15, %xmm2, %eax ; AVX512VL-NEXT: 
andl $31, %eax -; AVX512VL-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $0, %xmm1, %eax ; AVX512VL-NEXT: andl $31, %eax ; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax @@ -2293,49 +2383,64 @@ define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices) ; XOP-NEXT: vmovd %eax, %xmm0 ; XOP-NEXT: vpextrb $1, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $2, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $3, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $4, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $5, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $6, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $7, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $8, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $9, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $10, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $11, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $12, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $13, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $14, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $15, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $0, %xmm1, %eax ; XOP-NEXT: andl $15, %eax ; XOP-NEXT: movzbl 
-24(%rsp,%rax), %eax @@ -2399,49 +2504,64 @@ define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices) ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpextrb $1, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $2, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $3, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $4, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $5, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $6, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $7, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $8, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $9, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $10, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $11, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $12, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $13, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $14, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $15, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $0, %xmm1, %eax ; AVX1-NEXT: andl $15, %eax ; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax @@ -2505,49 +2625,64 @@ define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices) ; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: vpextrb $1, %xmm2, 
%eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $2, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $3, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $4, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $5, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $6, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $7, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $8, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $9, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $10, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $11, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $12, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $13, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $14, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $15, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm1, %eax ; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax @@ -2611,49 +2746,64 @@ define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices) ; AVX512F-NEXT: vmovd %eax, %xmm0 ; AVX512F-NEXT: vpextrb $1, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, 
%xmm0
 ; AVX512F-NEXT: vpextrb $2, %xmm2, %eax
 ; AVX512F-NEXT: andl $15, %eax
-; AVX512F-NEXT: vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
 ; AVX512F-NEXT: vpextrb $3, %xmm2, %eax
 ; AVX512F-NEXT: andl $15, %eax
-; AVX512F-NEXT: vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
 ; AVX512F-NEXT: vpextrb $4, %xmm2, %eax
 ; AVX512F-NEXT: andl $15, %eax
-; AVX512F-NEXT: vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
 ; AVX512F-NEXT: vpextrb $5, %xmm2, %eax
 ; AVX512F-NEXT: andl $15, %eax
-; AVX512F-NEXT: vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
 ; AVX512F-NEXT: vpextrb $6, %xmm2, %eax
 ; AVX512F-NEXT: andl $15, %eax
-; AVX512F-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
 ; AVX512F-NEXT: vpextrb $7, %xmm2, %eax
 ; AVX512F-NEXT: andl $15, %eax
-; AVX512F-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
 ; AVX512F-NEXT: vpextrb $8, %xmm2, %eax
 ; AVX512F-NEXT: andl $15, %eax
-; AVX512F-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
 ; AVX512F-NEXT: vpextrb $9, %xmm2, %eax
 ; AVX512F-NEXT: andl $15, %eax
-; AVX512F-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
 ; AVX512F-NEXT: vpextrb $10, %xmm2, %eax
 ; AVX512F-NEXT: andl $15, %eax
-; AVX512F-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
 ; AVX512F-NEXT: vpextrb $11, %xmm2, %eax
 ; AVX512F-NEXT: andl $15, %eax
-; AVX512F-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
 ; AVX512F-NEXT: vpextrb $12, %xmm2, %eax
 ; AVX512F-NEXT: andl $15, %eax
-; AVX512F-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
 ; AVX512F-NEXT: vpextrb $13, %xmm2, %eax
 ; AVX512F-NEXT: andl $15, %eax
-; AVX512F-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
 ; AVX512F-NEXT: vpextrb $14, %xmm2, %eax
 ; AVX512F-NEXT: andl $15, %eax
-; AVX512F-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
 ; AVX512F-NEXT: vpextrb $15, %xmm2, %eax
 ; AVX512F-NEXT: andl $15, %eax
-; AVX512F-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
 ; AVX512F-NEXT: vpextrb $0, %xmm1, %eax
 ; AVX512F-NEXT: andl $15, %eax
 ; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
@@ -2717,49 +2867,64 @@ define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices)
 ; AVX512DQ-NEXT: vmovd %eax, %xmm0
 ; AVX512DQ-NEXT: vpextrb $1, %xmm2, %eax
 ; AVX512DQ-NEXT: andl $15, %eax
-; AVX512DQ-NEXT: vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpextrb $2, %xmm2, %eax
 ; AVX512DQ-NEXT: andl $15, %eax
-; AVX512DQ-NEXT: vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpextrb $3, %xmm2, %eax
 ; AVX512DQ-NEXT: andl $15, %eax
-; AVX512DQ-NEXT: vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpextrb $4, %xmm2, %eax
 ; AVX512DQ-NEXT: andl $15, %eax
-; AVX512DQ-NEXT: vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpextrb $5, %xmm2, %eax
 ; AVX512DQ-NEXT: andl $15, %eax
-; AVX512DQ-NEXT: vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpextrb $6, %xmm2, %eax
 ; AVX512DQ-NEXT: andl $15, %eax
-; AVX512DQ-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpextrb $7, %xmm2, %eax
 ; AVX512DQ-NEXT: andl $15, %eax
-; AVX512DQ-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpextrb $8, %xmm2, %eax
 ; AVX512DQ-NEXT: andl $15, %eax
-; AVX512DQ-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpextrb $9, %xmm2, %eax
 ; AVX512DQ-NEXT: andl $15, %eax
-; AVX512DQ-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpextrb $10, %xmm2, %eax
 ; AVX512DQ-NEXT: andl $15, %eax
-; AVX512DQ-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpextrb $11, %xmm2, %eax
 ; AVX512DQ-NEXT: andl $15, %eax
-; AVX512DQ-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpextrb $12, %xmm2, %eax
 ; AVX512DQ-NEXT: andl $15, %eax
-; AVX512DQ-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpextrb $13, %xmm2, %eax
 ; AVX512DQ-NEXT: andl $15, %eax
-; AVX512DQ-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpextrb $14, %xmm2, %eax
 ; AVX512DQ-NEXT: andl $15, %eax
-; AVX512DQ-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpextrb $15, %xmm2, %eax
 ; AVX512DQ-NEXT: andl $15, %eax
-; AVX512DQ-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpextrb $0, %xmm1, %eax
 ; AVX512DQ-NEXT: andl $15, %eax
 ; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
@@ -2823,49 +2988,64 @@ define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices)
 ; AVX512VL-NEXT: vmovd %eax, %xmm0
 ; AVX512VL-NEXT: vpextrb $1, %xmm2, %eax
 ; AVX512VL-NEXT: andl $15, %eax
-; AVX512VL-NEXT: vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512VL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
 ; AVX512VL-NEXT: vpextrb $2, %xmm2, %eax
 ; AVX512VL-NEXT: andl $15, %eax
-; AVX512VL-NEXT: vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512VL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
 ; AVX512VL-NEXT: vpextrb $3, %xmm2, %eax
 ; AVX512VL-NEXT: andl $15, %eax
-; AVX512VL-NEXT: vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512VL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
 ; AVX512VL-NEXT: vpextrb $4, %xmm2, %eax
 ; AVX512VL-NEXT: andl $15, %eax
-; AVX512VL-NEXT: vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512VL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
 ; AVX512VL-NEXT: vpextrb $5, %xmm2, %eax
 ; AVX512VL-NEXT: andl $15, %eax
-; AVX512VL-NEXT: vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512VL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
 ; AVX512VL-NEXT: vpextrb $6, %xmm2, %eax
 ; AVX512VL-NEXT: andl $15, %eax
-; AVX512VL-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512VL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
 ; AVX512VL-NEXT: vpextrb $7, %xmm2, %eax
 ; AVX512VL-NEXT: andl $15, %eax
-; AVX512VL-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512VL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
 ; AVX512VL-NEXT: vpextrb $8, %xmm2, %eax
 ; AVX512VL-NEXT: andl $15, %eax
-; AVX512VL-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512VL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
 ; AVX512VL-NEXT: vpextrb $9, %xmm2, %eax
 ; AVX512VL-NEXT: andl $15, %eax
-; AVX512VL-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512VL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
 ; AVX512VL-NEXT: vpextrb $10, %xmm2, %eax
 ; AVX512VL-NEXT: andl $15, %eax
-; AVX512VL-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512VL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
 ; AVX512VL-NEXT: vpextrb $11, %xmm2, %eax
 ; AVX512VL-NEXT: andl $15, %eax
-; AVX512VL-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512VL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
 ; AVX512VL-NEXT: vpextrb $12, %xmm2, %eax
 ; AVX512VL-NEXT: andl $15, %eax
-; AVX512VL-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512VL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
 ; AVX512VL-NEXT: vpextrb $13, %xmm2, %eax
 ; AVX512VL-NEXT: andl $15, %eax
-; AVX512VL-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512VL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
 ; AVX512VL-NEXT: vpextrb $14, %xmm2, %eax
 ; AVX512VL-NEXT: andl $15, %eax
-; AVX512VL-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512VL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
 ; AVX512VL-NEXT: vpextrb $15, %xmm2, %eax
 ; AVX512VL-NEXT: andl $15, %eax
-; AVX512VL-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
+; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
 ; AVX512VL-NEXT: vpextrb $0, %xmm1, %eax
 ; AVX512VL-NEXT: andl $15, %eax
 ; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll
index b4950ee49fc..c726a149175 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll
@@ -47,7 +47,8 @@ define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0,
 ; ALL-NEXT: andl $3, %edx
 ; ALL-NEXT: andl $3, %esi
 ; ALL-NEXT: vmovaps %ymm0, (%rsp)
-; ALL-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; ALL-NEXT: movq %rbp, %rsp