| author | Eli Friedman <efriedma@codeaurora.org> | 2016-12-16 18:44:08 +0000 |
|---|---|---|
| committer | Eli Friedman <efriedma@codeaurora.org> | 2016-12-16 18:44:08 +0000 |
| commit | f624ec27b75701f1227eb03a44f3da84920c798d (patch) | |
| tree | d10b422365e2b461b34e1a53f1f29a72cb5b5b00 /llvm/test/CodeGen/ARM/vmul.ll | |
| parent | 79b4f0ad9cc5b019cf71bac388f1da1de4dd4e34 (diff) | |
[ARM] Add ARMISD::VLD1DUP to match vld1_dup more consistently.
Currently, there are substantial problems forming vld1_dup even if the
VDUP survives legalization. The lack of a dedicated node for the splatted
load leads to terrible results: not only can we not form post-increment
vld1_dup instructions, but we also form scalar pre-increment and
post-increment loads that force the loaded value into a GPR. This patch
fixes that by combining the vdup+load into an ARMISD::VLD1DUP node before
DAGCombine can break the pattern apart.
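For illustration only (this example is not part of the patch, and the function
and value names are hypothetical), a minimal IR sketch of the pattern the
combine targets: a scalar load splatted to every lane, which should select to
a single to-all-lanes load such as "vld1.32 {d16[], d17[]}, [r0:32]" instead of
a GPR load followed by a vdup.

```llvm
; Hypothetical sketch of a splatted load that should become a vld1_dup.
define <4 x float> @splat_of_load(float* %p) nounwind {
  %s = load float, float* %p, align 4
  %v = insertelement <4 x float> undef, float %s, i32 0
  %splat = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %splat
}
```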
Also includes a crash fix for vld2_dup (see testcase @vld2dupi8_postinc_variable).
Recommitting with a fix to avoid forming vld1dup if the type of the load
doesn't match the type of the vdup (see
https://llvm.org/bugs/show_bug.cgi?id=31404).
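As a hedged sketch of the kind of mismatch the recommit guards against (names
are hypothetical, and this may not be the exact PR31404 reproducer): when the
splatted scalar comes from a load of a narrower type, e.g. an i8 load extended
to i32, the loaded memory type no longer matches the vdup element type, so
forming a VLD1DUP of the wider element directly would be incorrect.

```llvm
; Hypothetical sketch: the memory access is an i8 load (an extending load
; after selection), but the splat elements are i32, so the combine must not
; turn this into an i32 VLD1DUP of the i8 location.
define <4 x i32> @splat_of_extended_load(i8* %p) nounwind {
  %b = load i8, i8* %p, align 1
  %w = zext i8 %b to i32
  %v = insertelement <4 x i32> undef, i32 %w, i32 0
  %splat = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %splat
}
```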
Differential Revision: https://reviews.llvm.org/D27694
llvm-svn: 289972
Diffstat (limited to 'llvm/test/CodeGen/ARM/vmul.ll')
-rw-r--r-- | llvm/test/CodeGen/ARM/vmul.ll | 25 |
1 file changed, 19 insertions, 6 deletions
diff --git a/llvm/test/CodeGen/ARM/vmul.ll b/llvm/test/CodeGen/ARM/vmul.ll
index 0455190b4c9..fcffe175e2b 100644
--- a/llvm/test/CodeGen/ARM/vmul.ll
+++ b/llvm/test/CodeGen/ARM/vmul.ll
@@ -635,13 +635,26 @@ entry:
   ret void
 }
 
-define void @foo(<4 x float> * %a, <4 x float>* nocapture %dst, float* nocapture readonly %src) nounwind {
-; Look for doing a normal scalar FP load rather than an to-all-lanes load.
-; e.g., "ldr s0, [r2]" rathern than "vld1.32 {d18[], d19[]}, [r2:32]"
-; Then check that the vector multiply has folded the splat to all lanes
-; and used a vector * scalar instruction.
-; CHECK: vldr {{s[0-9]+}}, [r2]
+define void @fmul_splat(<4 x float> * %a, <4 x float>* nocapture %dst, float %tmp) nounwind {
+; Look for a scalar float rather than a splat, then a vector*scalar multiply.
+; CHECK: vmov s0, r2
 ; CHECK: vmul.f32 q8, q8, d0[0]
+  %tmp5 = load <4 x float>, <4 x float>* %a, align 4
+  %tmp6 = insertelement <4 x float> undef, float %tmp, i32 0
+  %tmp7 = insertelement <4 x float> %tmp6, float %tmp, i32 1
+  %tmp8 = insertelement <4 x float> %tmp7, float %tmp, i32 2
+  %tmp9 = insertelement <4 x float> %tmp8, float %tmp, i32 3
+  %tmp10 = fmul <4 x float> %tmp9, %tmp5
+  store <4 x float> %tmp10, <4 x float>* %dst, align 4
+  ret void
+}
+
+define void @fmul_splat_load(<4 x float> * %a, <4 x float>* nocapture %dst, float* nocapture readonly %src) nounwind {
+; Look for doing a normal scalar FP load rather than an to-all-lanes load,
+; then a vector*scalar multiply.
+; FIXME: Temporarily broken due to splat representation changes.
+; CHECK: vld1.32 {d18[], d19[]}, [r2:32]
+; CHECK: vmul.f32 q8, q9, q8
   %tmp = load float, float* %src, align 4
   %tmp5 = load <4 x float>, <4 x float>* %a, align 4
   %tmp6 = insertelement <4 x float> undef, float %tmp, i32 0