summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp6
-rw-r--r--llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp6
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll78
-rw-r--r--llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll55
4 files changed, 97 insertions, 48 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index b8833e5a555..97df096143f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -509,8 +509,10 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
// ldN/stN only support legal vector types of size 64 or 128 in bits.
- if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
- return Factor;
+ // Accesses having vector types that are a multiple of 128 bits can be
+ // matched to more than one ldN/stN instruction.
+ if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize % 128 == 0))
+ return Factor * ((SubVecSize + 127) / 128);
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 90e4f3a7cc6..ea923c53b00 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -533,9 +533,11 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
// vldN/vstN only support legal vector types of size 64 or 128 in bits.
- if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128) &&
+ // Accesses having vector types that are a multiple of 128 bits can be
+ // matched to more than one vldN/vstN instruction.
+ if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize % 128 == 0) &&
!VecTy->getScalarType()->isHalfTy())
- return Factor;
+ return Factor * ((SubVecSize + 127) / 128);
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
index 796f8c2bda8..5add5a24d76 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
@@ -44,15 +44,20 @@ entry:
br label %for.body
; VF_4-LABEL: Checking a loop in "i16_factor_2"
-; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
-; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
-; VF_8-LABEL: Checking a loop in "i16_factor_2"
-; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
-; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_8-LABEL: Checking a loop in "i16_factor_2"
+; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_16-LABEL: Checking a loop in "i16_factor_2"
+; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0
@@ -74,16 +79,26 @@ define void @i32_factor_2(%i32.2* %data, i64 %n) {
entry:
br label %for.body
-; VF_2-LABEL: Checking a loop in "i32_factor_2"
-; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
-; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
-; VF_4-LABEL: Checking a loop in "i32_factor_2"
-; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_2-LABEL: Checking a loop in "i32_factor_2"
+; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_4-LABEL: Checking a loop in "i32_factor_2"
+; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_8-LABEL: Checking a loop in "i32_factor_2"
+; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_16-LABEL: Checking a loop in "i32_factor_2"
+; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0
@@ -105,11 +120,26 @@ define void @i64_factor_2(%i64.2* %data, i64 %n) {
entry:
br label %for.body
-; VF_2-LABEL: Checking a loop in "i64_factor_2"
-; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_2-LABEL: Checking a loop in "i64_factor_2"
+; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_4-LABEL: Checking a loop in "i64_factor_2"
+; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 4 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_8-LABEL: Checking a loop in "i64_factor_2"
+; VF_8: Found an estimated cost of 8 for VF 8 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 8 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_16-LABEL: Checking a loop in "i64_factor_2"
+; VF_16: Found an estimated cost of 16 for VF 16 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
index 685c523d495..c85dc63c31a 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
@@ -43,16 +43,21 @@ define void @i16_factor_2(%i16.2* %data, i64 %n) {
entry:
br label %for.body
-; VF_4-LABEL: Checking a loop in "i16_factor_2"
-; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
-; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
-; VF_8-LABEL: Checking a loop in "i16_factor_2"
-; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
-; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_4-LABEL: Checking a loop in "i16_factor_2"
+; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_8-LABEL: Checking a loop in "i16_factor_2"
+; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_16-LABEL: Checking a loop in "i16_factor_2"
+; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0
@@ -74,16 +79,26 @@ define void @i32_factor_2(%i32.2* %data, i64 %n) {
entry:
br label %for.body
-; VF_2-LABEL: Checking a loop in "i32_factor_2"
-; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
-; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
-; VF_4-LABEL: Checking a loop in "i32_factor_2"
-; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_2-LABEL: Checking a loop in "i32_factor_2"
+; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_4-LABEL: Checking a loop in "i32_factor_2"
+; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_8-LABEL: Checking a loop in "i32_factor_2"
+; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_16-LABEL: Checking a loop in "i32_factor_2"
+; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0
OpenPOWER on IntegriCloud