summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPetr Pavlu <petr.pavlu@arm.com>2018-07-30 08:49:30 +0000
committerPetr Pavlu <petr.pavlu@arm.com>2018-07-30 08:49:30 +0000
commit8b6eff4e77f07f2afd738a97a821b5d292b9fdf1 (patch)
treec789ca421673f7e9f066fce464f039587c6c4d31
parent6a5c95bd6643a78b5bc1df7397444138956ffbc0 (diff)
downloadbcm5719-llvm-8b6eff4e77f07f2afd738a97a821b5d292b9fdf1.tar.gz
bcm5719-llvm-8b6eff4e77f07f2afd738a97a821b5d292b9fdf1.zip
[ARM] Fix over-alignment in arguments that are HA of 128-bit vectors
Code in `CC_ARM_AAPCS_Custom_Aggregate()` is responsible for handling homogeneous aggregates for `CC_ARM_AAPCS_VFP`. When an aggregate ends up fully on stack, the function tries to pack all resulting items of the aggregate as tightly as possible according to AAPCS. Once the first item was laid out, the alignment used for consecutive items was the size of one item. This logic went wrong for 128-bit vectors because their alignment is normally only 64 bits, and so could result in inserting unexpected padding between the first and second element. The patch fixes the problem by updating the alignment with the item size only if this results in reducing it. Differential Revision: https://reviews.llvm.org/D49720 llvm-svn: 338233
-rw-r--r--llvm/lib/Target/ARM/ARMCallingConv.h11
-rw-r--r--llvm/test/CodeGen/ARM/aggregate-padding.ll16
2 files changed, 22 insertions, 5 deletions
diff --git a/llvm/lib/Target/ARM/ARMCallingConv.h b/llvm/lib/Target/ARM/ARMCallingConv.h
index 63bf48abb7a..543165de38d 100644
--- a/llvm/lib/Target/ARM/ARMCallingConv.h
+++ b/llvm/lib/Target/ARM/ARMCallingConv.h
@@ -269,14 +269,15 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT,
for (auto Reg : RegList)
State.AllocateReg(Reg);
+ // After the first item has been allocated, the rest are packed as tightly as
+ // possible. (E.g. an incoming i64 would have starting Align of 8, but we'll
+ // be allocating a bunch of i32 slots).
+ unsigned RestAlign = std::min(Align, Size);
+
for (auto &It : PendingMembers) {
It.convertToMem(State.AllocateStack(Size, Align));
State.addLoc(It);
-
- // After the first item has been allocated, the rest are packed as tightly
- // as possible. (E.g. an incoming i64 would have starting Align of 8, but
- // we'll be allocating a bunch of i32 slots).
- Align = Size;
+ Align = RestAlign;
}
// All pending members have now been allocated
diff --git a/llvm/test/CodeGen/ARM/aggregate-padding.ll b/llvm/test/CodeGen/ARM/aggregate-padding.ll
index bc46a9cdf91..ae7ab90fcd2 100644
--- a/llvm/test/CodeGen/ARM/aggregate-padding.ll
+++ b/llvm/test/CodeGen/ARM/aggregate-padding.ll
@@ -99,3 +99,19 @@ define i16 @test_i16_forced_stack([8 x double], double, i32, i32, [3 x i16] %arg
%sum = add i16 %val0, %val2
ret i16 %sum
}
+
+; [2 x <4 x i32>] should be aligned only on a 64-bit boundary and contiguous.
+; Neither of the two <4 x i32> elements should introduce any padding to 128 bits.
+define i32 @test_4xi32_64bit_aligned_and_contiguous([8 x double], float, [2 x <4 x i32>] %arg) nounwind {
+; CHECK-LABEL: test_4xi32_64bit_aligned_and_contiguous:
+; CHECK-DAG: ldr [[VAL0_0:r[0-9]+]], [sp, #8]
+; CHECK-DAG: ldr [[VAL1_0:r[0-9]+]], [sp, #24]
+; CHECK: add r0, [[VAL0_0]], [[VAL1_0]]
+
+ %val0 = extractvalue [2 x <4 x i32>] %arg, 0
+ %val0_0 = extractelement <4 x i32> %val0, i32 0
+ %val1 = extractvalue [2 x <4 x i32>] %arg, 1
+ %val1_0 = extractelement <4 x i32> %val1, i32 0
+ %sum = add i32 %val0_0, %val1_0
+ ret i32 %sum
+}
OpenPOWER on IntegriCloud