diff options
4 files changed, 97 insertions, 48 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index b8833e5a555..97df096143f 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -509,8 +509,10 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); // ldN/stN only support legal vector types of size 64 or 128 in bits. - if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128)) - return Factor; + // Accesses having vector types that are a multiple of 128 bits can be + // matched to more than one ldN/stN instruction. + if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize % 128 == 0)) + return Factor * ((SubVecSize + 127) / 128); } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 90e4f3a7cc6..ea923c53b00 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -533,9 +533,11 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); // vldN/vstN only support legal vector types of size 64 or 128 in bits. - if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128) && + // Accesses having vector types that are a multiple of 128 bits can be + // matched to more than one vldN/vstN instruction. + if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize % 128 == 0) && !VecTy->getScalarType()->isHalfTy()) - return Factor; + return Factor * ((SubVecSize + 127) / 128); } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll index 796f8c2bda8..5add5a24d76 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll @@ -44,15 +44,20 @@ entry: br label %for.body ; VF_4-LABEL: Checking a loop in "i16_factor_2" -; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2 -; VF_8-LABEL: Checking a loop in "i16_factor_2" -; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_8-LABEL: Checking a loop in "i16_factor_2" +; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_16-LABEL: Checking a loop in "i16_factor_2" +; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0 @@ -74,16 +79,26 @@ define void @i32_factor_2(%i32.2* %data, i64 %n) { entry: br label %for.body -; VF_2-LABEL: Checking a loop in "i32_factor_2" -; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4 -; VF_4-LABEL: Checking a loop in "i32_factor_2" -; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4 -; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_2-LABEL: Checking a loop in "i32_factor_2" +; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_4-LABEL: Checking a loop in "i32_factor_2" +; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_8-LABEL: Checking a loop in "i32_factor_2" +; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_16-LABEL: Checking a loop in "i32_factor_2" +; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0 @@ -105,11 +120,26 @@ define void @i64_factor_2(%i64.2* %data, i64 %n) { entry: br label %for.body -; VF_2-LABEL: Checking a loop in "i64_factor_2" -; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8 +; VF_2-LABEL: Checking a loop in "i64_factor_2" +; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8 +; VF_4-LABEL: Checking a loop in "i64_factor_2" +; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 4 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8 +; VF_8-LABEL: Checking a loop in "i64_factor_2" +; VF_8: Found an estimated cost of 8 for VF 8 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 8 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8 +; VF_16-LABEL: Checking a loop in "i64_factor_2" +; VF_16: Found an estimated cost of 16 for VF 16 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll index 685c523d495..c85dc63c31a 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll @@ -43,16 +43,21 @@ define void @i16_factor_2(%i16.2* %data, i64 %n) { entry: br label %for.body -; VF_4-LABEL: Checking a loop in "i16_factor_2" -; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2 -; VF_8-LABEL: Checking a loop in "i16_factor_2" -; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_4-LABEL: Checking a loop in "i16_factor_2" +; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_8-LABEL: Checking a loop in "i16_factor_2" +; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_16-LABEL: Checking a loop in "i16_factor_2" +; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0 @@ -74,16 +79,26 @@ define void @i32_factor_2(%i32.2* %data, i64 %n) { entry: br label %for.body -; VF_2-LABEL: Checking a loop in "i32_factor_2" -; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4 -; VF_4-LABEL: Checking a loop in "i32_factor_2" -; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4 -; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_2-LABEL: Checking a loop in "i32_factor_2" +; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_4-LABEL: Checking a loop in "i32_factor_2" +; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_8-LABEL: Checking a loop in "i32_factor_2" +; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_16-LABEL: Checking a loop in "i32_factor_2" +; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0 |