diff options
Diffstat (limited to 'llvm')
-rw-r--r-- | llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp | 16 | ||||
-rw-r--r-- | llvm/lib/Target/ARM/ARMISelLowering.cpp | 131 | ||||
-rw-r--r-- | llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll | 17 | ||||
-rw-r--r-- | llvm/test/CodeGen/ARM/memcpy-inline.ll | 15 | ||||
-rw-r--r-- | llvm/test/CodeGen/ARM/vector-load.ll | 253 | ||||
-rw-r--r-- | llvm/test/CodeGen/ARM/vector-store.ll | 258 | ||||
-rw-r--r-- | llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll | 4 |
7 files changed, 654 insertions, 40 deletions
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 0cbfe488ac4..6410b338238 100644 --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -992,18 +992,24 @@ bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr, Addr = N; unsigned Alignment = 0; - if (LSBaseSDNode *LSN = dyn_cast<LSBaseSDNode>(Parent)) { + + MemSDNode *MemN = cast<MemSDNode>(Parent); + + if (isa<LSBaseSDNode>(MemN) || + ((MemN->getOpcode() == ARMISD::VST1_UPD || + MemN->getOpcode() == ARMISD::VLD1_UPD) && + MemN->getConstantOperandVal(MemN->getNumOperands() - 1) == 1)) { // This case occurs only for VLD1-lane/dup and VST1-lane instructions. // The maximum alignment is equal to the memory size being referenced. - unsigned LSNAlign = LSN->getAlignment(); - unsigned MemSize = LSN->getMemoryVT().getSizeInBits() / 8; - if (LSNAlign >= MemSize && MemSize > 1) + unsigned MMOAlign = MemN->getAlignment(); + unsigned MemSize = MemN->getMemoryVT().getSizeInBits() / 8; + if (MMOAlign >= MemSize && MemSize > 1) Alignment = MemSize; } else { // All other uses of addrmode6 are for intrinsics. For now just record // the raw alignment value; it will be refined later based on the legal // alignment operands for the intrinsic. - Alignment = cast<MemIntrinsicSDNode>(Parent)->getAlignment(); + Alignment = MemN->getAlignment(); } Align = CurDAG->getTargetConstant(Alignment, MVT::i32); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index f07af201e6e..4f9bdc0ab76 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -565,6 +565,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::FP_TO_SINT); setTargetDAGCombine(ISD::FP_TO_UINT); setTargetDAGCombine(ISD::FDIV); + setTargetDAGCombine(ISD::LOAD); // It is legal to extload from v4i8 to v4i16 or v4i32. 
MVT Tys[6] = {MVT::v8i8, MVT::v4i8, MVT::v2i8, @@ -8872,17 +8873,18 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { DAG.getUNDEF(VT), NewMask.data()); } -/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP and -/// NEON load/store intrinsics to merge base address updates. +/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, +/// NEON load/store intrinsics, and generic vector load/stores, to merge +/// base address updates. +/// For generic load/stores, the memory type is assumed to be a vector. +/// The caller is assumed to have checked legality. static SDValue CombineBaseUpdate(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { - if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) - return SDValue(); - SelectionDAG &DAG = DCI.DAG; const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || N->getOpcode() == ISD::INTRINSIC_W_CHAIN); - const unsigned AddrOpIdx = (isIntrinsic ? 2 : 1); + const bool isStore = N->getOpcode() == ISD::STORE; + const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1); SDValue Addr = N->getOperand(AddrOpIdx); MemSDNode *MemN = cast<MemSDNode>(N); @@ -8944,15 +8946,24 @@ static SDValue CombineBaseUpdate(SDNode *N, case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; + case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD; + NumVecs = 1; isLaneOp = false; break; + case ISD::STORE: NewOpc = ARMISD::VST1_UPD; + NumVecs = 1; isLaneOp = false; isLoadOp = false; break; } } // Find the size of memory referenced by the load/store. 
EVT VecTy; - if (isLoadOp) + if (isLoadOp) { VecTy = N->getValueType(0); - else + } else if (isIntrinsic) { VecTy = N->getOperand(AddrOpIdx+1).getValueType(); + } else { + assert(isStore && "Node has to be a load, a store, or an intrinsic!"); + VecTy = N->getOperand(1).getValueType(); + } + unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; if (isLaneOp) NumBytes /= VecTy.getVectorNumElements(); @@ -8969,13 +8980,53 @@ static SDValue CombineBaseUpdate(SDNode *N, continue; } + // OK, we found an ADD we can fold into the base update. + // Now, create a _UPD node, taking care of not breaking alignment. + + EVT AlignedVecTy = VecTy; + unsigned Alignment = MemN->getAlignment(); + + // If this is a less-than-standard-aligned load/store, change the type to + // match the standard alignment. + // The alignment is overlooked when selecting _UPD variants; and it's + // easier to introduce bitcasts here than fix that. + // There are 3 ways to get to this base-update combine: + // - intrinsics: they are assumed to be properly aligned (to the standard + // alignment of the memory type), so we don't need to do anything. + // - ARMISD::VLDx nodes: they are only generated from the aforementioned + // intrinsics, so, likewise, there's nothing to do. + // - generic load/store instructions: the alignment is specified as an + // explicit operand, rather than implicitly as the standard alignment + // of the memory type (like the intrinsics). We need to change the + // memory type to match the explicit alignment. That way, we don't + // generate non-standard-aligned ARMISD::VLDx nodes. 
+ if (isa<LSBaseSDNode>(N)) { + if (Alignment == 0) + Alignment = 1; + if (Alignment < VecTy.getScalarSizeInBits() / 8) { + MVT EltTy = MVT::getIntegerVT(Alignment * 8); + assert(NumVecs == 1 && "Unexpected multi-element generic load/store."); + assert(!isLaneOp && "Unexpected generic load/store lane."); + unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8); + AlignedVecTy = MVT::getVectorVT(EltTy, NumElts); + } + // Don't set an explicit alignment on regular load/stores that we want + // to transform to VLD/VST 1_UPD nodes. + // This matches the behavior of regular load/stores, which only get an + // explicit alignment if the MMO alignment is larger than the standard + // alignment of the memory type. + // Intrinsics, however, always get an explicit alignment, set to the + // alignment of the MMO. + Alignment = 1; + } + // Create the new updating load/store node. // First, create an SDVTList for the new updating node's results. EVT Tys[6]; unsigned NumResultVecs = (isLoadOp ? NumVecs : 0); unsigned n; for (n = 0; n < NumResultVecs; ++n) - Tys[n] = VecTy; + Tys[n] = AlignedVecTy; Tys[n++] = MVT::i32; Tys[n] = MVT::Other; SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2)); @@ -8985,17 +9036,43 @@ static SDValue CombineBaseUpdate(SDNode *N, Ops.push_back(N->getOperand(0)); // incoming chain Ops.push_back(N->getOperand(AddrOpIdx)); Ops.push_back(Inc); - for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) - Ops.push_back(N->getOperand(i)); + + if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) { + // Try to match the intrinsic's signature + Ops.push_back(StN->getValue()); + } else { + // Loads (and of course intrinsics) match the intrinsics' signature, + // so just add all but the alignment operand. + for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i) + Ops.push_back(N->getOperand(i)); + } + + // For all node types, the alignment operand is always the last one. 
+ Ops.push_back(DAG.getConstant(Alignment, MVT::i32)); + + // If this is a non-standard-aligned STORE, the penultimate operand is the + // stored value. Bitcast it to the aligned type. + if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) { + SDValue &StVal = Ops[Ops.size()-2]; + StVal = DAG.getNode(ISD::BITCAST, SDLoc(N), AlignedVecTy, StVal); + } SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, - Ops, MemN->getMemoryVT(), + Ops, AlignedVecTy, MemN->getMemOperand()); // Update the uses. SmallVector<SDValue, 5> NewResults; for (unsigned i = 0; i < NumResultVecs; ++i) NewResults.push_back(SDValue(UpdN.getNode(), i)); + + // If this is a non-standard-aligned LOAD, the first result is the loaded + // value. Bitcast it to the expected result type. + if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) { + SDValue &LdVal = NewResults[0]; + LdVal = DAG.getNode(ISD::BITCAST, SDLoc(N), VecTy, LdVal); + } + NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain DCI.CombineTo(N, NewResults); DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); @@ -9005,6 +9082,14 @@ static SDValue CombineBaseUpdate(SDNode *N, return SDValue(); } +static SDValue PerformVLDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + return CombineBaseUpdate(N, DCI); +} + /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic /// are also VDUPLANEs. If so, combine them to a vldN-dup operation and @@ -9118,6 +9203,18 @@ static SDValue PerformVDUPLANECombine(SDNode *N, return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); } +static SDValue PerformLOADCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + + // If this is a legal vector load, try to combine it into a VLD1_UPD. 
+ if (ISD::isNormalLoad(N) && VT.isVector() && + DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return CombineBaseUpdate(N, DCI); + + return SDValue(); +} + /// PerformSTORECombine - Target-specific dag combine xforms for /// ISD::STORE. static SDValue PerformSTORECombine(SDNode *N, @@ -9256,6 +9353,11 @@ static SDValue PerformSTORECombine(SDNode *N, St->getAAInfo()); } + // If this is a legal vector store, try to combine it into a VST1_UPD. + if (ISD::isNormalStore(N) && VT.isVector() && + DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return CombineBaseUpdate(N, DCI); + return SDValue(); } @@ -9849,10 +9951,11 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); case ISD::SELECT_CC: return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget); case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); + case ISD::LOAD: return PerformLOADCombine(N, DCI); case ARMISD::VLD2DUP: case ARMISD::VLD3DUP: case ARMISD::VLD4DUP: - return CombineBaseUpdate(N, DCI); + return PerformVLDCombine(N, DCI); case ARMISD::BUILD_VECTOR: return PerformARMBUILD_VECTORCombine(N, DCI); case ISD::INTRINSIC_VOID: @@ -9872,7 +9975,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case Intrinsic::arm_neon_vst2lane: case Intrinsic::arm_neon_vst3lane: case Intrinsic::arm_neon_vst4lane: - return CombineBaseUpdate(N, DCI); + return PerformVLDCombine(N, DCI); default: break; } break; diff --git a/llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll b/llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll index 24c28baff88..5ad87191efe 100644 --- a/llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll +++ b/llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll @@ -9,8 +9,8 @@ define void @test1(<16 x float>* noalias sret %agg.result) nounwind ssp "no-realign-stack" { entry: ; NO-REALIGN-LABEL: test1 -; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1:[0-9]+]]:128] -; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #16 +; NO-REALIGN: mov 
r[[R2:[0-9]+]], r[[R1:[0-9]+]] +; NO-REALIGN: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]! ; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] ; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32 ; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] @@ -21,16 +21,14 @@ entry: ; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] ; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32 ; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] -; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #16 -; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] +; NO-REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]! ; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] ; NO-REALIGN: add r[[R2:[0-9]+]], r[[R0:0]], #48 ; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] ; NO-REALIGN: add r[[R2:[0-9]+]], r[[R0]], #32 ; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] -; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #16 -; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] +; NO-REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]! ; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128] %retval = alloca <16 x float>, align 16 %0 = load <16 x float>* @T3_retval, align 16 @@ -44,8 +42,8 @@ define void @test2(<16 x float>* noalias sret %agg.result) nounwind ssp { entry: ; REALIGN-LABEL: test2 ; REALIGN: bfc sp, #0, #6 -; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1:[0-9]+]]:128] -; REALIGN: add r[[R2:[0-9]+]], r[[R1]], #16 +; REALIGN: mov r[[R2:[0-9]+]], r[[R1:[0-9]+]] +; REALIGN: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]! 
; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] ; REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32 ; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] @@ -65,8 +63,7 @@ entry: ; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] ; REALIGN: add r[[R1:[0-9]+]], r[[R0]], #32 ; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] -; REALIGN: add r[[R1:[0-9]+]], r[[R0]], #16 -; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] +; REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]! ; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128] %retval = alloca <16 x float>, align 16 %0 = load <16 x float>* @T3_retval, align 16 diff --git a/llvm/test/CodeGen/ARM/memcpy-inline.ll b/llvm/test/CodeGen/ARM/memcpy-inline.ll index 84ce4a7f0e7..33ac4e12563 100644 --- a/llvm/test/CodeGen/ARM/memcpy-inline.ll +++ b/llvm/test/CodeGen/ARM/memcpy-inline.ll @@ -46,10 +46,8 @@ entry: ; CHECK: movw [[REG2:r[0-9]+]], #16716 ; CHECK: movt [[REG2:r[0-9]+]], #72 ; CHECK: str [[REG2]], [r0, #32] -; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] -; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] -; CHECK: adds r0, #16 -; CHECK: adds r1, #16 +; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]! +; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]! ; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([36 x i8]* @.str2, i64 0, i64 0), i64 36, i32 1, i1 false) @@ -59,10 +57,8 @@ entry: define void @t3(i8* nocapture %C) nounwind { entry: ; CHECK-LABEL: t3: -; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] -; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] -; CHECK: adds r0, #16 -; CHECK: adds r1, #16 +; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]! +; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]! 
; CHECK: vld1.8 {d{{[0-9]+}}}, [r1] ; CHECK: vst1.8 {d{{[0-9]+}}}, [r0] tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([24 x i8]* @.str3, i64 0, i64 0), i64 24, i32 1, i1 false) @@ -73,7 +69,8 @@ define void @t4(i8* nocapture %C) nounwind { entry: ; CHECK-LABEL: t4: ; CHECK: vld1.8 {[[REG3:d[0-9]+]], [[REG4:d[0-9]+]]}, [r1] -; CHECK: vst1.8 {[[REG3]], [[REG4]]}, [r0] +; CHECK: vst1.8 {[[REG3]], [[REG4]]}, [r0]! +; CHECK: strh [[REG5:r[0-9]+]], [r0] tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([18 x i8]* @.str4, i64 0, i64 0), i64 18, i32 1, i1 false) ret void } diff --git a/llvm/test/CodeGen/ARM/vector-load.ll b/llvm/test/CodeGen/ARM/vector-load.ll new file mode 100644 index 00000000000..c177a55d7a1 --- /dev/null +++ b/llvm/test/CodeGen/ARM/vector-load.ll @@ -0,0 +1,253 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:o-p:32:32-i1:8:32-i8:8:32-i16:16:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32" +target triple = "thumbv7s-apple-ios8.0.0" + +define <8 x i8> @load_v8i8(<8 x i8>** %ptr) { +;CHECK-LABEL: load_v8i8: +;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <8 x i8>** %ptr + %lA = load <8 x i8>* %A, align 1 + ret <8 x i8> %lA +} + +define <8 x i8> @load_v8i8_update(<8 x i8>** %ptr) { +;CHECK-LABEL: load_v8i8_update: +;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <8 x i8>** %ptr + %lA = load <8 x i8>* %A, align 1 + %inc = getelementptr <8 x i8>* %A, i38 1 + store <8 x i8>* %inc, <8 x i8>** %ptr + ret <8 x i8> %lA +} + +define <4 x i16> @load_v4i16(<4 x i16>** %ptr) { +;CHECK-LABEL: load_v4i16: +;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <4 x i16>** %ptr + %lA = load <4 x i16>* %A, align 1 + ret <4 x i16> %lA +} + +define <4 x i16> @load_v4i16_update(<4 x i16>** %ptr) { +;CHECK-LABEL: load_v4i16_update: +;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]! 
+ %A = load <4 x i16>** %ptr + %lA = load <4 x i16>* %A, align 1 + %inc = getelementptr <4 x i16>* %A, i34 1 + store <4 x i16>* %inc, <4 x i16>** %ptr + ret <4 x i16> %lA +} + +define <2 x i32> @load_v2i32(<2 x i32>** %ptr) { +;CHECK-LABEL: load_v2i32: +;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <2 x i32>** %ptr + %lA = load <2 x i32>* %A, align 1 + ret <2 x i32> %lA +} + +define <2 x i32> @load_v2i32_update(<2 x i32>** %ptr) { +;CHECK-LABEL: load_v2i32_update: +;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <2 x i32>** %ptr + %lA = load <2 x i32>* %A, align 1 + %inc = getelementptr <2 x i32>* %A, i32 1 + store <2 x i32>* %inc, <2 x i32>** %ptr + ret <2 x i32> %lA +} + +define <2 x float> @load_v2f32(<2 x float>** %ptr) { +;CHECK-LABEL: load_v2f32: +;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <2 x float>** %ptr + %lA = load <2 x float>* %A, align 1 + ret <2 x float> %lA +} + +define <2 x float> @load_v2f32_update(<2 x float>** %ptr) { +;CHECK-LABEL: load_v2f32_update: +;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <2 x float>** %ptr + %lA = load <2 x float>* %A, align 1 + %inc = getelementptr <2 x float>* %A, i32 1 + store <2 x float>* %inc, <2 x float>** %ptr + ret <2 x float> %lA +} + +define <1 x i64> @load_v1i64(<1 x i64>** %ptr) { +;CHECK-LABEL: load_v1i64: +;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <1 x i64>** %ptr + %lA = load <1 x i64>* %A, align 1 + ret <1 x i64> %lA +} + +define <1 x i64> @load_v1i64_update(<1 x i64>** %ptr) { +;CHECK-LABEL: load_v1i64_update: +;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]! 
+ %A = load <1 x i64>** %ptr + %lA = load <1 x i64>* %A, align 1 + %inc = getelementptr <1 x i64>* %A, i31 1 + store <1 x i64>* %inc, <1 x i64>** %ptr + ret <1 x i64> %lA +} + +define <16 x i8> @load_v16i8(<16 x i8>** %ptr) { +;CHECK-LABEL: load_v16i8: +;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <16 x i8>** %ptr + %lA = load <16 x i8>* %A, align 1 + ret <16 x i8> %lA +} + +define <16 x i8> @load_v16i8_update(<16 x i8>** %ptr) { +;CHECK-LABEL: load_v16i8_update: +;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <16 x i8>** %ptr + %lA = load <16 x i8>* %A, align 1 + %inc = getelementptr <16 x i8>* %A, i316 1 + store <16 x i8>* %inc, <16 x i8>** %ptr + ret <16 x i8> %lA +} + +define <8 x i16> @load_v8i16(<8 x i16>** %ptr) { +;CHECK-LABEL: load_v8i16: +;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <8 x i16>** %ptr + %lA = load <8 x i16>* %A, align 1 + ret <8 x i16> %lA +} + +define <8 x i16> @load_v8i16_update(<8 x i16>** %ptr) { +;CHECK-LABEL: load_v8i16_update: +;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <8 x i16>** %ptr + %lA = load <8 x i16>* %A, align 1 + %inc = getelementptr <8 x i16>* %A, i38 1 + store <8 x i16>* %inc, <8 x i16>** %ptr + ret <8 x i16> %lA +} + +define <4 x i32> @load_v4i32(<4 x i32>** %ptr) { +;CHECK-LABEL: load_v4i32: +;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <4 x i32>** %ptr + %lA = load <4 x i32>* %A, align 1 + ret <4 x i32> %lA +} + +define <4 x i32> @load_v4i32_update(<4 x i32>** %ptr) { +;CHECK-LABEL: load_v4i32_update: +;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! 
+ %A = load <4 x i32>** %ptr + %lA = load <4 x i32>* %A, align 1 + %inc = getelementptr <4 x i32>* %A, i34 1 + store <4 x i32>* %inc, <4 x i32>** %ptr + ret <4 x i32> %lA +} + +define <4 x float> @load_v4f32(<4 x float>** %ptr) { +;CHECK-LABEL: load_v4f32: +;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <4 x float>** %ptr + %lA = load <4 x float>* %A, align 1 + ret <4 x float> %lA +} + +define <4 x float> @load_v4f32_update(<4 x float>** %ptr) { +;CHECK-LABEL: load_v4f32_update: +;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <4 x float>** %ptr + %lA = load <4 x float>* %A, align 1 + %inc = getelementptr <4 x float>* %A, i34 1 + store <4 x float>* %inc, <4 x float>** %ptr + ret <4 x float> %lA +} + +define <2 x i64> @load_v2i64(<2 x i64>** %ptr) { +;CHECK-LABEL: load_v2i64: +;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <2 x i64>** %ptr + %lA = load <2 x i64>* %A, align 1 + ret <2 x i64> %lA +} + +define <2 x i64> @load_v2i64_update(<2 x i64>** %ptr) { +;CHECK-LABEL: load_v2i64_update: +;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <2 x i64>** %ptr + %lA = load <2 x i64>* %A, align 1 + %inc = getelementptr <2 x i64>* %A, i32 1 + store <2 x i64>* %inc, <2 x i64>** %ptr + ret <2 x i64> %lA +} + +; Make sure we change the type to match alignment if necessary. +define <2 x i64> @load_v2i64_update_aligned2(<2 x i64>** %ptr) { +;CHECK-LABEL: load_v2i64_update_aligned2: +;CHECK: vld1.16 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <2 x i64>** %ptr + %lA = load <2 x i64>* %A, align 2 + %inc = getelementptr <2 x i64>* %A, i32 1 + store <2 x i64>* %inc, <2 x i64>** %ptr + ret <2 x i64> %lA +} + +define <2 x i64> @load_v2i64_update_aligned4(<2 x i64>** %ptr) { +;CHECK-LABEL: load_v2i64_update_aligned4: +;CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! 
+ %A = load <2 x i64>** %ptr + %lA = load <2 x i64>* %A, align 4 + %inc = getelementptr <2 x i64>* %A, i32 1 + store <2 x i64>* %inc, <2 x i64>** %ptr + ret <2 x i64> %lA +} + +define <2 x i64> @load_v2i64_update_aligned8(<2 x i64>** %ptr) { +;CHECK-LABEL: load_v2i64_update_aligned8: +;CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <2 x i64>** %ptr + %lA = load <2 x i64>* %A, align 8 + %inc = getelementptr <2 x i64>* %A, i32 1 + store <2 x i64>* %inc, <2 x i64>** %ptr + ret <2 x i64> %lA +} + +define <2 x i64> @load_v2i64_update_aligned16(<2 x i64>** %ptr) { +;CHECK-LABEL: load_v2i64_update_aligned16: +;CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}:128]! + %A = load <2 x i64>** %ptr + %lA = load <2 x i64>* %A, align 16 + %inc = getelementptr <2 x i64>* %A, i32 1 + store <2 x i64>* %inc, <2 x i64>** %ptr + ret <2 x i64> %lA +} + +; Make sure we don't break smaller-than-dreg extloads. +define <4 x i32> @zextload_v8i8tov8i32(<4 x i8>** %ptr) { +;CHECK-LABEL: zextload_v8i8tov8i32: +;CHECK: vld1.32 {{{d[0-9]+}}[0]}, [{{r[0-9]+}}:32] +;CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}} +;CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}} + %A = load <4 x i8>** %ptr + %lA = load <4 x i8>* %A, align 4 + %zlA = zext <4 x i8> %lA to <4 x i32> + ret <4 x i32> %zlA +} + +define <4 x i32> @zextload_v8i8tov8i32_fake_update(<4 x i8>** %ptr) { +;CHECK-LABEL: zextload_v8i8tov8i32_fake_update: +;CHECK: ldr.w r[[PTRREG:[0-9]+]], [r0] +;CHECK: vld1.32 {{{d[0-9]+}}[0]}, [r[[PTRREG]]:32] +;CHECK: add.w r[[INCREG:[0-9]+]], r[[PTRREG]], #16 +;CHECK: str.w r[[INCREG]], [r0] +;CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}} +;CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}} + %A = load <4 x i8>** %ptr + %lA = load <4 x i8>* %A, align 4 + %inc = getelementptr <4 x i8>* %A, i38 4 + store <4 x i8>* %inc, <4 x i8>** %ptr + %zlA = zext <4 x i8> %lA to <4 x i32> + ret <4 x i32> %zlA +} diff --git a/llvm/test/CodeGen/ARM/vector-store.ll b/llvm/test/CodeGen/ARM/vector-store.ll new file mode 100644 index 
00000000000..55cb8f26658 --- /dev/null +++ b/llvm/test/CodeGen/ARM/vector-store.ll @@ -0,0 +1,258 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:o-p:32:32-i1:8:32-i8:8:32-i16:16:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32" +target triple = "thumbv7s-apple-ios8.0.0" + +define void @store_v8i8(<8 x i8>** %ptr, <8 x i8> %val) { +;CHECK-LABEL: store_v8i8: +;CHECK: str r1, [r0] + %A = load <8 x i8>** %ptr + store <8 x i8> %val, <8 x i8>* %A, align 1 + ret void +} + +define void @store_v8i8_update(<8 x i8>** %ptr, <8 x i8> %val) { +;CHECK-LABEL: store_v8i8_update: +;CHECK: vst1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <8 x i8>** %ptr + store <8 x i8> %val, <8 x i8>* %A, align 1 + %inc = getelementptr <8 x i8>* %A, i38 1 + store <8 x i8>* %inc, <8 x i8>** %ptr + ret void +} + +define void @store_v4i16(<4 x i16>** %ptr, <4 x i16> %val) { +;CHECK-LABEL: store_v4i16: +;CHECK: str r1, [r0] + %A = load <4 x i16>** %ptr + store <4 x i16> %val, <4 x i16>* %A, align 1 + ret void +} + +define void @store_v4i16_update(<4 x i16>** %ptr, <4 x i16> %val) { +;CHECK-LABEL: store_v4i16_update: +;CHECK: vst1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <4 x i16>** %ptr + store <4 x i16> %val, <4 x i16>* %A, align 1 + %inc = getelementptr <4 x i16>* %A, i34 1 + store <4 x i16>* %inc, <4 x i16>** %ptr + ret void +} + +define void @store_v2i32(<2 x i32>** %ptr, <2 x i32> %val) { +;CHECK-LABEL: store_v2i32: +;CHECK: str r1, [r0] + %A = load <2 x i32>** %ptr + store <2 x i32> %val, <2 x i32>* %A, align 1 + ret void +} + +define void @store_v2i32_update(<2 x i32>** %ptr, <2 x i32> %val) { +;CHECK-LABEL: store_v2i32_update: +;CHECK: vst1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]! 
+ %A = load <2 x i32>** %ptr + store <2 x i32> %val, <2 x i32>* %A, align 1 + %inc = getelementptr <2 x i32>* %A, i32 1 + store <2 x i32>* %inc, <2 x i32>** %ptr + ret void +} + +define void @store_v2f32(<2 x float>** %ptr, <2 x float> %val) { +;CHECK-LABEL: store_v2f32: +;CHECK: str r1, [r0] + %A = load <2 x float>** %ptr + store <2 x float> %val, <2 x float>* %A, align 1 + ret void +} + +define void @store_v2f32_update(<2 x float>** %ptr, <2 x float> %val) { +;CHECK-LABEL: store_v2f32_update: +;CHECK: vst1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <2 x float>** %ptr + store <2 x float> %val, <2 x float>* %A, align 1 + %inc = getelementptr <2 x float>* %A, i32 1 + store <2 x float>* %inc, <2 x float>** %ptr + ret void +} + +define void @store_v1i64(<1 x i64>** %ptr, <1 x i64> %val) { +;CHECK-LABEL: store_v1i64: +;CHECK: str r1, [r0] + %A = load <1 x i64>** %ptr + store <1 x i64> %val, <1 x i64>* %A, align 1 + ret void +} + +define void @store_v1i64_update(<1 x i64>** %ptr, <1 x i64> %val) { +;CHECK-LABEL: store_v1i64_update: +;CHECK: vst1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <1 x i64>** %ptr + store <1 x i64> %val, <1 x i64>* %A, align 1 + %inc = getelementptr <1 x i64>* %A, i31 1 + store <1 x i64>* %inc, <1 x i64>** %ptr + ret void +} + +define void @store_v16i8(<16 x i8>** %ptr, <16 x i8> %val) { +;CHECK-LABEL: store_v16i8: +;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <16 x i8>** %ptr + store <16 x i8> %val, <16 x i8>* %A, align 1 + ret void +} + +define void @store_v16i8_update(<16 x i8>** %ptr, <16 x i8> %val) { +;CHECK-LABEL: store_v16i8_update: +;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! 
+ %A = load <16 x i8>** %ptr + store <16 x i8> %val, <16 x i8>* %A, align 1 + %inc = getelementptr <16 x i8>* %A, i316 1 + store <16 x i8>* %inc, <16 x i8>** %ptr + ret void +} + +define void @store_v8i16(<8 x i16>** %ptr, <8 x i16> %val) { +;CHECK-LABEL: store_v8i16: +;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <8 x i16>** %ptr + store <8 x i16> %val, <8 x i16>* %A, align 1 + ret void +} + +define void @store_v8i16_update(<8 x i16>** %ptr, <8 x i16> %val) { +;CHECK-LABEL: store_v8i16_update: +;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <8 x i16>** %ptr + store <8 x i16> %val, <8 x i16>* %A, align 1 + %inc = getelementptr <8 x i16>* %A, i38 1 + store <8 x i16>* %inc, <8 x i16>** %ptr + ret void +} + +define void @store_v4i32(<4 x i32>** %ptr, <4 x i32> %val) { +;CHECK-LABEL: store_v4i32: +;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <4 x i32>** %ptr + store <4 x i32> %val, <4 x i32>* %A, align 1 + ret void +} + +define void @store_v4i32_update(<4 x i32>** %ptr, <4 x i32> %val) { +;CHECK-LABEL: store_v4i32_update: +;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <4 x i32>** %ptr + store <4 x i32> %val, <4 x i32>* %A, align 1 + %inc = getelementptr <4 x i32>* %A, i34 1 + store <4 x i32>* %inc, <4 x i32>** %ptr + ret void +} + +define void @store_v4f32(<4 x float>** %ptr, <4 x float> %val) { +;CHECK-LABEL: store_v4f32: +;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <4 x float>** %ptr + store <4 x float> %val, <4 x float>* %A, align 1 + ret void +} + +define void @store_v4f32_update(<4 x float>** %ptr, <4 x float> %val) { +;CHECK-LABEL: store_v4f32_update: +;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! 
+ %A = load <4 x float>** %ptr + store <4 x float> %val, <4 x float>* %A, align 1 + %inc = getelementptr <4 x float>* %A, i34 1 + store <4 x float>* %inc, <4 x float>** %ptr + ret void +} + +define void @store_v2i64(<2 x i64>** %ptr, <2 x i64> %val) { +;CHECK-LABEL: store_v2i64: +;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}] + %A = load <2 x i64>** %ptr + store <2 x i64> %val, <2 x i64>* %A, align 1 + ret void +} + +define void @store_v2i64_update(<2 x i64>** %ptr, <2 x i64> %val) { +;CHECK-LABEL: store_v2i64_update: +;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <2 x i64>** %ptr + store <2 x i64> %val, <2 x i64>* %A, align 1 + %inc = getelementptr <2 x i64>* %A, i32 1 + store <2 x i64>* %inc, <2 x i64>** %ptr + ret void +} + +define void @store_v2i64_update_aligned2(<2 x i64>** %ptr, <2 x i64> %val) { +;CHECK-LABEL: store_v2i64_update_aligned2: +;CHECK: vst1.16 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <2 x i64>** %ptr + store <2 x i64> %val, <2 x i64>* %A, align 2 + %inc = getelementptr <2 x i64>* %A, i32 1 + store <2 x i64>* %inc, <2 x i64>** %ptr + ret void +} + +define void @store_v2i64_update_aligned4(<2 x i64>** %ptr, <2 x i64> %val) { +;CHECK-LABEL: store_v2i64_update_aligned4: +;CHECK: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! + %A = load <2 x i64>** %ptr + store <2 x i64> %val, <2 x i64>* %A, align 4 + %inc = getelementptr <2 x i64>* %A, i32 1 + store <2 x i64>* %inc, <2 x i64>** %ptr + ret void +} + +define void @store_v2i64_update_aligned8(<2 x i64>** %ptr, <2 x i64> %val) { +;CHECK-LABEL: store_v2i64_update_aligned8: +;CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]! 
+ %A = load <2 x i64>** %ptr + store <2 x i64> %val, <2 x i64>* %A, align 8 + %inc = getelementptr <2 x i64>* %A, i32 1 + store <2 x i64>* %inc, <2 x i64>** %ptr + ret void +} + +define void @store_v2i64_update_aligned16(<2 x i64>** %ptr, <2 x i64> %val) { +;CHECK-LABEL: store_v2i64_update_aligned16: +;CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}:128]! + %A = load <2 x i64>** %ptr + store <2 x i64> %val, <2 x i64>* %A, align 16 + %inc = getelementptr <2 x i64>* %A, i32 1 + store <2 x i64>* %inc, <2 x i64>** %ptr + ret void +} + +define void @truncstore_v4i32tov4i8(<4 x i8>** %ptr, <4 x i32> %val) { +;CHECK-LABEL: truncstore_v4i32tov4i8: +;CHECK: ldr.w r9, [sp] +;CHECK: vmov {{d[0-9]+}}, r3, r9 +;CHECK: vmov {{d[0-9]+}}, r1, r2 +;CHECK: vmovn.i32 [[VECLO:d[0-9]+]], {{q[0-9]+}} +;CHECK: vuzp.8 [[VECLO]], {{d[0-9]+}} +;CHECK: ldr r[[PTRREG:[0-9]+]], [r0] +;CHECK: vst1.32 {[[VECLO]][0]}, [r[[PTRREG]]:32] + %A = load <4 x i8>** %ptr + %trunc = trunc <4 x i32> %val to <4 x i8> + store <4 x i8> %trunc, <4 x i8>* %A, align 4 + ret void +} + +define void @truncstore_v4i32tov4i8_fake_update(<4 x i8>** %ptr, <4 x i32> %val) { +;CHECK-LABEL: truncstore_v4i32tov4i8_fake_update: +;CHECK: ldr.w r9, [sp] +;CHECK: vmov {{d[0-9]+}}, r3, r9 +;CHECK: vmov {{d[0-9]+}}, r1, r2 +;CHECK: movs [[IMM16:r[0-9]+]], #16 +;CHECK: vmovn.i32 [[VECLO:d[0-9]+]], {{q[0-9]+}} +;CHECK: vuzp.8 [[VECLO]], {{d[0-9]+}} +;CHECK: ldr r[[PTRREG:[0-9]+]], [r0] +;CHECK: vst1.32 {[[VECLO]][0]}, [r[[PTRREG]]:32], [[IMM16]] +;CHECK: str r[[PTRREG]], [r0] + %A = load <4 x i8>** %ptr + %trunc = trunc <4 x i32> %val to <4 x i8> + store <4 x i8> %trunc, <4 x i8>* %A, align 4 + %inc = getelementptr <4 x i8>* %A, i38 4 + store <4 x i8>* %inc, <4 x i8>** %ptr + ret void +} diff --git a/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll b/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll index f4edf092641..26b294042d4 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll +++ 
b/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll @@ -201,7 +201,7 @@ for.end: ; preds = %for.body ; ; Currently we have three extra add.w's that keep the store address ; live past the next increment because ISEL is unfortunately undoing -; the store chain. ISEL also fails to convert the stores to +; the store chain. ISEL also fails to convert all but one of the stores to ; post-increment addressing. However, the loads should use ; post-increment addressing, no add's or add.w's beyond the three ; mentioned. Most importantly, there should be no spills or reloads! @@ -210,7 +210,7 @@ for.end: ; preds = %for.body ; A9: %.lr.ph ; A9-NOT: lsl.w ; A9-NOT: {{ldr|str|adds|add r}} -; A9: add.w r +; A9: vst1.8 {{.*}} [r{{[0-9]+}}]! ; A9-NOT: {{ldr|str|adds|add r}} ; A9: add.w r ; A9-NOT: {{ldr|str|adds|add r}} |