From cee313d288a4faf0355d76fb6e0e927e211d08a5 Mon Sep 17 00:00:00 2001
From: Eric Christopher
Date: Wed, 17 Apr 2019 04:52:47 +0000
Subject: Revert "Temporarily Revert "Add basic loop fusion pass.""

The reversion apparently deleted the test/Transforms directory. Will be
re-reverting again.

llvm-svn: 358552
---
 .../Transforms/LoopVectorize/12-12-11-if-conv.ll | 39 +
 .../Transforms/LoopVectorize/2012-10-20-infloop.ll | 71 +
 .../LoopVectorize/2012-10-22-isconsec.ll | 53 +
 .../LoopVectorize/2016-07-27-loop-vec.ll | 19 +
 .../LoopVectorize/AArch64/aarch64-predication.ll | 79 +
 .../LoopVectorize/AArch64/aarch64-unroll.ll | 42 +
 .../AArch64/arbitrary-induction-step.ll | 147 +
 .../LoopVectorize/AArch64/arm64-unroll.ll | 42 +
 .../LoopVectorize/AArch64/backedge-overflow.ll | 166 +
 .../AArch64/deterministic-type-shrinkage.ll | 54 +
 .../LoopVectorize/AArch64/gather-cost.ll | 85 +
 .../LoopVectorize/AArch64/induction-trunc.ll | 30 +
 .../LoopVectorize/AArch64/interleaved-vs-scalar.ll | 37 +
 .../LoopVectorize/AArch64/interleaved_cost.ll | 189 ++
 .../Transforms/LoopVectorize/AArch64/lit.local.cfg | 5 +
 .../AArch64/loop-vectorization-factors.ll | 310 ++
 .../AArch64/max-vf-for-interleaved.ll | 56 +
 .../AArch64/no_vector_instructions.ll | 49 +
 .../outer_loop_test1_no_explicit_vect_width.ll | 144 +
 .../Transforms/LoopVectorize/AArch64/pr31900.ll | 37 +
 .../Transforms/LoopVectorize/AArch64/pr33053.ll | 56 +
 .../Transforms/LoopVectorize/AArch64/pr36032.ll | 153 +
 .../LoopVectorize/AArch64/predication_costs.ll | 231 ++
 .../LoopVectorize/AArch64/reduction-small-size.ll | 171 +
 .../Transforms/LoopVectorize/AArch64/sdiv-pow2.ll | 31 +
 .../AArch64/smallest-and-widest-types.ll | 33 +
 .../AArch64/type-shrinkage-insertelt.ll | 47 +
 .../AMDGPU/divergent-runtime-check.ll | 29 +
 .../Transforms/LoopVectorize/AMDGPU/lit.local.cfg | 2 +
 .../Transforms/LoopVectorize/AMDGPU/packed-math.ll | 34 +
 .../AMDGPU/unroll-in-loop-vectorizer.ll | 28 +
 .../LoopVectorize/ARM/arm-ieee-vectorize.ll | 330 ++
 .../Transforms/LoopVectorize/ARM/arm-unroll.ll | 71 +
 .../Transforms/LoopVectorize/ARM/gather-cost.ll | 88 +
 .../Transforms/LoopVectorize/ARM/gcc-examples.ll | 60 +
 .../LoopVectorize/ARM/interleaved_cost.ll | 147 +
 .../Transforms/LoopVectorize/ARM/lit.local.cfg | 3 +
 .../Transforms/LoopVectorize/ARM/mul-cast-vect.ll | 114 +
 llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll | 165 +
 .../Transforms/LoopVectorize/ARM/vector_cast.ll | 37 +
 .../Transforms/LoopVectorize/ARM/width-detect.ll | 52 +
 .../Transforms/LoopVectorize/Hexagon/lit.local.cfg | 2 +
 .../Transforms/LoopVectorize/Hexagon/minimum-vf.ll | 173 +
 .../LoopVectorize/PowerPC/agg-interleave-a2.ll | 40 +
 .../LoopVectorize/PowerPC/large-loop-rdx.ll | 75 +
 .../Transforms/LoopVectorize/PowerPC/lit.local.cfg | 3 +
 .../Transforms/LoopVectorize/PowerPC/pr30990.ll | 140 +
 .../LoopVectorize/PowerPC/small-loop-rdx.ll | 49 +
 .../LoopVectorize/PowerPC/stride-vectorization.ll | 36 +
 .../PowerPC/vectorize-only-for-real.ll | 62 +
 .../LoopVectorize/PowerPC/vsx-tsvc-s173.ll | 51 +
 .../Transforms/LoopVectorize/SystemZ/addressing.ll | 72 +
 .../SystemZ/branch-for-predicated-block.ll | 38 +
 .../Transforms/LoopVectorize/SystemZ/lit.local.cfg | 2 +
 .../SystemZ/load-scalarization-cost-0.ll | 27 +
 .../SystemZ/load-scalarization-cost-1.ll | 28 +
 .../SystemZ/load-store-scalarization-cost.ll | 33 +
 .../SystemZ/mem-interleaving-costs-02.ll | 149 +
 .../SystemZ/mem-interleaving-costs.ll | 70 +
 .../Transforms/LoopVectorize/SystemZ/pr38110.ll | 50 +
 .../LoopVectorize/X86/already-vectorized.ll | 46 +
llvm/test/Transforms/LoopVectorize/X86/assume.ll | 100 + llvm/test/Transforms/LoopVectorize/X86/avx1.ll | 52 + llvm/test/Transforms/LoopVectorize/X86/avx512.ll | 112 + .../LoopVectorize/X86/consecutive-ptr-cg-bug.ll | 108 + .../LoopVectorize/X86/consecutive-ptr-uniforms.ll | 67 + .../Transforms/LoopVectorize/X86/constant-fold.ll | 53 + .../LoopVectorize/X86/constant-vector-operand.ll | 30 + .../LoopVectorize/X86/conversion-cost.ll | 47 + .../Transforms/LoopVectorize/X86/cost-model.ll | 82 + .../LoopVectorize/X86/float-induction-x86.ll | 149 + .../Transforms/LoopVectorize/X86/force-ifcvt.ll | 42 + .../LoopVectorize/X86/fp32_to_uint32-cost-model.ll | 39 + .../LoopVectorize/X86/fp64_to_uint32-cost-model.ll | 40 + .../LoopVectorize/X86/fp_to_sint8-cost-model.ll | 25 + llvm/test/Transforms/LoopVectorize/X86/funclet.ll | 45 + .../Transforms/LoopVectorize/X86/gather-cost.ll | 86 + .../LoopVectorize/X86/gather-vs-interleave.ll | 41 + .../Transforms/LoopVectorize/X86/gather_scatter.ll | 1754 ++++++++++ .../Transforms/LoopVectorize/X86/gcc-examples.ll | 77 + .../X86/illegal-parallel-loop-uniform-write.ll | 240 ++ .../LoopVectorize/X86/imprecise-through-phis.ll | 177 + .../LoopVectorize/X86/int128_no_gather.ll | 76 + .../X86/interleaved-accesses-large-gap.ll | 40 + .../Transforms/LoopVectorize/X86/interleaving.ll | 36 + .../LoopVectorize/X86/invariant-load-gather.ll | 93 + .../X86/invariant-store-vectorization.ll | 237 ++ .../Transforms/LoopVectorize/X86/lit.local.cfg | 3 + .../LoopVectorize/X86/masked_load_store.ll | 3374 ++++++++++++++++++++ .../Transforms/LoopVectorize/X86/max-mstore.ll | 46 + .../LoopVectorize/X86/metadata-enable.ll | 2473 ++++++++++++++ .../LoopVectorize/X86/min-trip-count-switch.ll | 24 + .../Transforms/LoopVectorize/X86/mul_slm_16bit.ll | 145 + .../test/Transforms/LoopVectorize/X86/no-vector.ll | 22 + .../test/Transforms/LoopVectorize/X86/no_fpmath.ll | 109 + .../LoopVectorize/X86/no_fpmath_with_hotness.ll | 113 + llvm/test/Transforms/LoopVectorize/X86/optsize.ll | 198 ++ .../X86/outer_loop_test1_no_explicit_vect_width.ll | 114 + .../X86/parallel-loops-after-reg2mem.ll | 50 + .../Transforms/LoopVectorize/X86/parallel-loops.ll | 115 + .../test/Transforms/LoopVectorize/X86/powof2div.ll | 32 + llvm/test/Transforms/LoopVectorize/X86/pr23997.ll | 109 + llvm/test/Transforms/LoopVectorize/X86/pr34438.ll | 36 + llvm/test/Transforms/LoopVectorize/X86/pr35432.ll | 213 ++ llvm/test/Transforms/LoopVectorize/X86/pr36524.ll | 39 + llvm/test/Transforms/LoopVectorize/X86/pr39160.ll | 98 + .../LoopVectorize/X86/propagate-metadata.ll | 25 + .../LoopVectorize/X86/ptr-indvar-crash.ll | 20 + llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll | 33 + .../LoopVectorize/X86/reduction-crash.ll | 35 + .../LoopVectorize/X86/reduction-fastmath.ll | 112 + .../LoopVectorize/X86/reduction-small-size.ll | 80 + .../LoopVectorize/X86/redundant-vf2-cost.ll | 34 + .../LoopVectorize/X86/reg-usage-debug.ll | 134 + .../test/Transforms/LoopVectorize/X86/reg-usage.ll | 135 + .../LoopVectorize/X86/register-assumption.ll | 32 + .../Transforms/LoopVectorize/X86/scatter_crash.ll | 114 + .../LoopVectorize/X86/slm-no-vectorize.ll | 49 + .../Transforms/LoopVectorize/X86/small-size.ll | 408 +++ .../LoopVectorize/X86/strided_load_cost.ll | 54 + .../Transforms/LoopVectorize/X86/struct-store.ll | 27 + .../LoopVectorize/X86/svml-calls-finite.ll | 187 ++ .../Transforms/LoopVectorize/X86/svml-calls.ll | 501 +++ .../test/Transforms/LoopVectorize/X86/tripcount.ll | 39 + .../LoopVectorize/X86/uint64_to_fp64-cost-model.ll | 27 + 
.../Transforms/LoopVectorize/X86/uniform-phi.ll | 99 + .../Transforms/LoopVectorize/X86/uniform_load.ll | 47 + .../Transforms/LoopVectorize/X86/uniformshift.ll | 23 + .../test/Transforms/LoopVectorize/X86/unroll-pm.ll | 31 + .../LoopVectorize/X86/unroll-small-loops.ll | 102 + .../LoopVectorize/X86/unroll_selection.ll | 71 + .../Transforms/LoopVectorize/X86/veclib-calls.ll | 632 ++++ .../Transforms/LoopVectorize/X86/vect.omp.force.ll | 87 + .../LoopVectorize/X86/vect.omp.force.small-tc.ll | 217 ++ .../LoopVectorize/X86/vector-scalar-select-cost.ll | 66 + .../LoopVectorize/X86/vector_max_bandwidth.ll | 75 + .../LoopVectorize/X86/vector_ptr_load_store.ll | 150 + .../X86/vectorization-remarks-loopid-dbg.ll | 74 + .../X86/vectorization-remarks-missed.ll | 313 ++ .../X86/vectorization-remarks-profitable.ll | 112 + .../LoopVectorize/X86/vectorization-remarks.ll | 73 + .../LoopVectorize/X86/vectorize-only-for-real.ll | 39 + .../X86/x86-interleaved-accesses-masked-group.ll | 826 +++++ .../Transforms/LoopVectorize/X86/x86-pr39099.ll | 60 + .../LoopVectorize/X86/x86-predication.ll | 98 + .../LoopVectorize/X86/x86_fp80-vector-store.ll | 29 + .../Transforms/LoopVectorize/XCore/lit.local.cfg | 2 + .../LoopVectorize/XCore/no-vector-registers.ll | 23 + llvm/test/Transforms/LoopVectorize/align.ll | 32 + llvm/test/Transforms/LoopVectorize/bsd_regex.ll | 38 + .../Transforms/LoopVectorize/bzip_reverse_loops.ll | 65 + llvm/test/Transforms/LoopVectorize/calloc.ll | 49 + .../Transforms/LoopVectorize/cast-induction.ll | 29 + .../LoopVectorize/conditional-assignment.ll | 57 + .../test/Transforms/LoopVectorize/consec_no_gep.ll | 42 + .../LoopVectorize/consecutive-ptr-uniforms.ll | 490 +++ llvm/test/Transforms/LoopVectorize/control-flow.ll | 77 + .../test/Transforms/LoopVectorize/cpp-new-array.ll | 45 + llvm/test/Transforms/LoopVectorize/dbg.value.ll | 77 + .../Transforms/LoopVectorize/dead_instructions.ll | 42 + llvm/test/Transforms/LoopVectorize/debugloc.ll | 89 + .../demanded-bits-of-pointer-instruction.ll | 20 + .../LoopVectorize/diag-missing-instr-debug-loc.ll | 77 + .../LoopVectorize/diag-with-hotness-info-2.ll | 200 ++ .../LoopVectorize/diag-with-hotness-info.ll | 213 ++ .../Transforms/LoopVectorize/disable_nonforced.ll | 29 + .../LoopVectorize/disable_nonforced_enable.ll | 29 + .../test/Transforms/LoopVectorize/discriminator.ll | 76 + llvm/test/Transforms/LoopVectorize/ee-crash.ll | 34 + llvm/test/Transforms/LoopVectorize/exact.ll | 23 + .../LoopVectorize/explicit_outer_detection.ll | 236 ++ .../explicit_outer_nonuniform_inner.ll | 177 + .../explicit_outer_uniform_diverg_branch.ll | 138 + .../Transforms/LoopVectorize/fcmp-vectorize.ll | 25 + .../LoopVectorize/first-order-recurrence.ll | 574 ++++ llvm/test/Transforms/LoopVectorize/flags.ll | 78 + .../Transforms/LoopVectorize/float-induction.ll | 340 ++ .../Transforms/LoopVectorize/float-reduction.ll | 46 + llvm/test/Transforms/LoopVectorize/followup.ll | 43 + llvm/test/Transforms/LoopVectorize/funcall.ll | 32 + llvm/test/Transforms/LoopVectorize/gcc-examples.ll | 685 ++++ .../Transforms/LoopVectorize/gep_with_bitcast.ll | 41 + llvm/test/Transforms/LoopVectorize/global_alias.ll | 1077 +++++++ llvm/test/Transforms/LoopVectorize/hints-trans.ll | 29 + llvm/test/Transforms/LoopVectorize/hoist-loads.ll | 70 + llvm/test/Transforms/LoopVectorize/i8-induction.ll | 40 + .../test/Transforms/LoopVectorize/icmp-uniforms.ll | 35 + .../test/Transforms/LoopVectorize/if-conv-crash.ll | 60 + .../LoopVectorize/if-conversion-edgemasks.ll | 245 ++ 
.../Transforms/LoopVectorize/if-conversion-nest.ll | 118 + .../LoopVectorize/if-conversion-reduction.ll | 37 + .../test/Transforms/LoopVectorize/if-conversion.ll | 197 ++ .../Transforms/LoopVectorize/if-pred-non-void.ll | 277 ++ .../LoopVectorize/if-pred-not-when-safe.ll | 89 + .../Transforms/LoopVectorize/if-pred-stores.ll | 178 ++ llvm/test/Transforms/LoopVectorize/if-reduction.ll | 821 +++++ .../Transforms/LoopVectorize/incorrect-dom-info.ll | 142 + llvm/test/Transforms/LoopVectorize/increment.ll | 65 + .../Transforms/LoopVectorize/induction-step.ll | 201 ++ llvm/test/Transforms/LoopVectorize/induction.ll | 896 ++++++ .../Transforms/LoopVectorize/induction_plus.ll | 34 + llvm/test/Transforms/LoopVectorize/infiniteloop.ll | 34 + .../Transforms/LoopVectorize/int_sideeffect.ll | 24 + .../LoopVectorize/interleaved-accesses-1.ll | 78 + .../LoopVectorize/interleaved-accesses-2.ll | 58 + .../LoopVectorize/interleaved-accesses-3.ll | 57 + .../LoopVectorize/interleaved-accesses-alias.ll | 63 + .../interleaved-accesses-masked-group.ll | 222 ++ .../interleaved-accesses-pred-stores.ll | 165 + .../LoopVectorize/interleaved-accesses.ll | 921 ++++++ .../interleaved-acess-with-remarks.ll | 43 + llvm/test/Transforms/LoopVectorize/intrinsic.ll | 1357 ++++++++ .../LoopVectorize/invariant-store-vectorization.ll | 593 ++++ .../Transforms/LoopVectorize/iv_outside_user.ll | 176 + llvm/test/Transforms/LoopVectorize/lcssa-crash.ll | 62 + .../LoopVectorize/legal_preheader_check.ll | 27 + .../Transforms/LoopVectorize/libcall-remark.ll | 52 + llvm/test/Transforms/LoopVectorize/lifetime.ll | 96 + llvm/test/Transforms/LoopVectorize/loop-form.ll | 31 + llvm/test/Transforms/LoopVectorize/loop-scalars.ll | 143 + .../Transforms/LoopVectorize/loop-vect-memdep.ll | 26 + llvm/test/Transforms/LoopVectorize/memdep.ll | 273 ++ .../Transforms/LoopVectorize/metadata-unroll.ll | 40 + .../Transforms/LoopVectorize/metadata-width.ll | 30 + llvm/test/Transforms/LoopVectorize/metadata.ll | 43 + .../Transforms/LoopVectorize/middle-block-dbg.ll | 110 + llvm/test/Transforms/LoopVectorize/miniters.ll | 44 + .../Transforms/LoopVectorize/minmax_reduction.ll | 885 +++++ .../LoopVectorize/multi-use-reduction-bug.ll | 41 + .../LoopVectorize/multiple-address-spaces.ll | 43 + .../multiple-strides-vectorization.ll | 64 + .../LoopVectorize/no-interleave-up-front.ll | 35 + .../Transforms/LoopVectorize/no_array_bounds.ll | 100 + .../Transforms/LoopVectorize/no_idiv_reduction.ll | 24 + .../Transforms/LoopVectorize/no_int_induction.ll | 60 + .../Transforms/LoopVectorize/no_outside_user.ll | 414 +++ llvm/test/Transforms/LoopVectorize/no_switch.ll | 93 + .../no_switch_disable_vectorization.ll | 95 + .../Transforms/LoopVectorize/noalias-md-licm.ll | 59 + llvm/test/Transforms/LoopVectorize/noalias-md.ll | 78 + llvm/test/Transforms/LoopVectorize/nofloat.ll | 28 + llvm/test/Transforms/LoopVectorize/non-const-n.ll | 37 + llvm/test/Transforms/LoopVectorize/nontemporal.ll | 46 + llvm/test/Transforms/LoopVectorize/nsw-crash.ll | 24 + llvm/test/Transforms/LoopVectorize/opt.ll | 27 + llvm/test/Transforms/LoopVectorize/optsize.ll | 102 + .../Transforms/LoopVectorize/outer_loop_test1.ll | 82 + .../Transforms/LoopVectorize/outer_loop_test2.ll | 112 + .../test/Transforms/LoopVectorize/partial-lcssa.ll | 54 + llvm/test/Transforms/LoopVectorize/phi-cost.ll | 86 + llvm/test/Transforms/LoopVectorize/phi-hang.ll | 47 + llvm/test/Transforms/LoopVectorize/pr25281.ll | 58 + llvm/test/Transforms/LoopVectorize/pr28541.ll | 71 + .../LoopVectorize/pr30654-phiscev-sext-trunc.ll 
| 241 ++ .../Transforms/LoopVectorize/pr30806-phi-scev.ll | 66 + llvm/test/Transforms/LoopVectorize/pr30806.ll | 65 + llvm/test/Transforms/LoopVectorize/pr31098.ll | 100 + llvm/test/Transforms/LoopVectorize/pr31190.ll | 63 + llvm/test/Transforms/LoopVectorize/pr32859.ll | 30 + llvm/test/Transforms/LoopVectorize/pr33706.ll | 61 + llvm/test/Transforms/LoopVectorize/pr34681.ll | 122 + llvm/test/Transforms/LoopVectorize/pr35743.ll | 102 + llvm/test/Transforms/LoopVectorize/pr35773.ll | 53 + llvm/test/Transforms/LoopVectorize/pr36311.ll | 49 + llvm/test/Transforms/LoopVectorize/pr36983.ll | 24 + llvm/test/Transforms/LoopVectorize/pr37248.ll | 42 + llvm/test/Transforms/LoopVectorize/pr37515.ll | 20 + llvm/test/Transforms/LoopVectorize/pr38800.ll | 34 + llvm/test/Transforms/LoopVectorize/pr39099.ll | 42 + .../LoopVectorize/pr39417-optsize-scevchecks.ll | 54 + .../preserve-dbg-loc-and-loop-metadata.ll | 38 + .../test/Transforms/LoopVectorize/ptr-induction.ll | 34 + llvm/test/Transforms/LoopVectorize/ptr_loops.ll | 73 + llvm/test/Transforms/LoopVectorize/read-only.ll | 31 + .../LoopVectorize/reduction-small-size.ll | 73 + llvm/test/Transforms/LoopVectorize/reduction.ll | 580 ++++ .../Transforms/LoopVectorize/remove_metadata.ll | 32 + .../Transforms/LoopVectorize/reverse_induction.ll | 152 + llvm/test/Transforms/LoopVectorize/reverse_iter.ll | 45 + .../LoopVectorize/runtime-check-address-space.ll | 221 ++ .../runtime-check-readonly-address-space.ll | 132 + .../LoopVectorize/runtime-check-readonly.ll | 37 + .../test/Transforms/LoopVectorize/runtime-check.ll | 179 ++ .../test/Transforms/LoopVectorize/runtime-limit.ll | 101 + llvm/test/Transforms/LoopVectorize/safegep.ll | 61 + .../Transforms/LoopVectorize/same-base-access.ll | 107 + .../test/Transforms/LoopVectorize/scalar-select.ll | 36 + .../LoopVectorize/scalar_after_vectorization.ll | 74 + .../Transforms/LoopVectorize/scev-exitlim-crash.ll | 113 + .../test/Transforms/LoopVectorize/simple-unroll.ll | 38 + .../Transforms/LoopVectorize/skip-iterations.ll | 181 ++ llvm/test/Transforms/LoopVectorize/small-loop.ll | 57 + .../Transforms/LoopVectorize/start-non-zero.ll | 30 + .../Transforms/LoopVectorize/store-shuffle-bug.ll | 49 + .../test/Transforms/LoopVectorize/struct_access.ll | 87 + llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll | 101 + llvm/test/Transforms/LoopVectorize/tripcount.ll | 211 ++ .../Transforms/LoopVectorize/undef-inst-bug.ll | 36 + .../unroll-novec-memcheck-metadata.ll | 36 + llvm/test/Transforms/LoopVectorize/unroll.ll | 37 + llvm/test/Transforms/LoopVectorize/unroll_novec.ll | 48 + .../Transforms/LoopVectorize/unsafe-dep-remark.ll | 73 + .../LoopVectorize/unsized-pointee-crash.ll | 23 + .../test/Transforms/LoopVectorize/value-ptr-bug.ll | 50 + .../LoopVectorize/vect-phiscev-sext-trunc.ll | 211 ++ .../LoopVectorize/vect.omp.persistence.ll | 36 + llvm/test/Transforms/LoopVectorize/vect.stats.ll | 58 + llvm/test/Transforms/LoopVectorize/vector-geps.ll | 61 + .../Transforms/LoopVectorize/vectorize-once.ll | 76 + .../Transforms/LoopVectorize/version-mem-access.ll | 94 + .../vplan-stress-test-no-explict-vf.ll | 45 + .../LoopVectorize/vplan_hcfg_stress_test.ll | 51 + llvm/test/Transforms/LoopVectorize/write-only.ll | 25 + .../LoopVectorize/zero-sized-pointee-crash.ll | 26 + 314 files changed, 42557 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll create mode 100644 llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll create mode 100644 llvm/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll 
create mode 100644 llvm/test/Transforms/LoopVectorize/2016-07-27-loop-vec.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/aarch64-unroll.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/arm64-unroll.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/backedge-overflow.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/gather-cost.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/lit.local.cfg create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/max-vf-for-interleaved.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/pr31900.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/sdiv-pow2.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AMDGPU/divergent-runtime-check.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AMDGPU/lit.local.cfg create mode 100644 llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll create mode 100644 llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll create mode 100644 llvm/test/Transforms/LoopVectorize/ARM/arm-unroll.ll create mode 100644 llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll create mode 100644 llvm/test/Transforms/LoopVectorize/ARM/gcc-examples.ll create mode 100644 llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll create mode 100644 llvm/test/Transforms/LoopVectorize/ARM/lit.local.cfg create mode 100644 llvm/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll create mode 100644 llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll create mode 100644 llvm/test/Transforms/LoopVectorize/ARM/vector_cast.ll create mode 100644 llvm/test/Transforms/LoopVectorize/ARM/width-detect.ll create mode 100644 llvm/test/Transforms/LoopVectorize/Hexagon/lit.local.cfg create mode 100644 llvm/test/Transforms/LoopVectorize/Hexagon/minimum-vf.ll create mode 100644 llvm/test/Transforms/LoopVectorize/PowerPC/agg-interleave-a2.ll create mode 100644 llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll create mode 100644 llvm/test/Transforms/LoopVectorize/PowerPC/lit.local.cfg create mode 100644 llvm/test/Transforms/LoopVectorize/PowerPC/pr30990.ll create mode 
100644 llvm/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll create mode 100644 llvm/test/Transforms/LoopVectorize/PowerPC/stride-vectorization.ll create mode 100644 llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll create mode 100644 llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll create mode 100644 llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll create mode 100644 llvm/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll create mode 100644 llvm/test/Transforms/LoopVectorize/SystemZ/lit.local.cfg create mode 100644 llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll create mode 100644 llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-1.ll create mode 100644 llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll create mode 100644 llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll create mode 100644 llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll create mode 100644 llvm/test/Transforms/LoopVectorize/SystemZ/pr38110.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/already-vectorized.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/assume.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/avx1.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/avx512.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/constant-vector-operand.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/cost-model.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/force-ifcvt.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/funclet.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/gather-cost.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/gcc-examples.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/interleaving.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/lit.local.cfg create mode 100644 llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/max-mstore.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll create mode 
100644 llvm/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/no-vector.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/no_fpmath.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/optsize.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/powof2div.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/pr23997.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/pr34438.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/pr35432.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/pr36524.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/pr39160.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/propagate-metadata.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/ptr-indvar-crash.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/register-assumption.ll create mode 100755 llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/small-size.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/struct-store.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/tripcount.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/uniform-phi.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/unroll_selection.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/veclib-calls.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll create mode 100644 
llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/vectorize-only-for-real.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/x86-pr39099.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll create mode 100644 llvm/test/Transforms/LoopVectorize/XCore/lit.local.cfg create mode 100644 llvm/test/Transforms/LoopVectorize/XCore/no-vector-registers.ll create mode 100644 llvm/test/Transforms/LoopVectorize/align.ll create mode 100644 llvm/test/Transforms/LoopVectorize/bsd_regex.ll create mode 100644 llvm/test/Transforms/LoopVectorize/bzip_reverse_loops.ll create mode 100644 llvm/test/Transforms/LoopVectorize/calloc.ll create mode 100644 llvm/test/Transforms/LoopVectorize/cast-induction.ll create mode 100644 llvm/test/Transforms/LoopVectorize/conditional-assignment.ll create mode 100644 llvm/test/Transforms/LoopVectorize/consec_no_gep.ll create mode 100644 llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll create mode 100644 llvm/test/Transforms/LoopVectorize/control-flow.ll create mode 100644 llvm/test/Transforms/LoopVectorize/cpp-new-array.ll create mode 100644 llvm/test/Transforms/LoopVectorize/dbg.value.ll create mode 100644 llvm/test/Transforms/LoopVectorize/dead_instructions.ll create mode 100644 llvm/test/Transforms/LoopVectorize/debugloc.ll create mode 100644 llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll create mode 100644 llvm/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll create mode 100644 llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll create mode 100644 llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll create mode 100644 llvm/test/Transforms/LoopVectorize/disable_nonforced.ll create mode 100644 llvm/test/Transforms/LoopVectorize/disable_nonforced_enable.ll create mode 100644 llvm/test/Transforms/LoopVectorize/discriminator.ll create mode 100644 llvm/test/Transforms/LoopVectorize/ee-crash.ll create mode 100644 llvm/test/Transforms/LoopVectorize/exact.ll create mode 100644 llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll create mode 100644 llvm/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll create mode 100644 llvm/test/Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll create mode 100644 llvm/test/Transforms/LoopVectorize/fcmp-vectorize.ll create mode 100644 llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll create mode 100644 llvm/test/Transforms/LoopVectorize/flags.ll create mode 100644 llvm/test/Transforms/LoopVectorize/float-induction.ll create mode 100644 llvm/test/Transforms/LoopVectorize/float-reduction.ll create mode 100644 llvm/test/Transforms/LoopVectorize/followup.ll create mode 100644 llvm/test/Transforms/LoopVectorize/funcall.ll create mode 100644 llvm/test/Transforms/LoopVectorize/gcc-examples.ll create mode 100644 llvm/test/Transforms/LoopVectorize/gep_with_bitcast.ll create mode 100644 llvm/test/Transforms/LoopVectorize/global_alias.ll create mode 100644 
llvm/test/Transforms/LoopVectorize/hints-trans.ll create mode 100644 llvm/test/Transforms/LoopVectorize/hoist-loads.ll create mode 100644 llvm/test/Transforms/LoopVectorize/i8-induction.ll create mode 100644 llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll create mode 100644 llvm/test/Transforms/LoopVectorize/if-conv-crash.ll create mode 100644 llvm/test/Transforms/LoopVectorize/if-conversion-edgemasks.ll create mode 100644 llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll create mode 100644 llvm/test/Transforms/LoopVectorize/if-conversion-reduction.ll create mode 100644 llvm/test/Transforms/LoopVectorize/if-conversion.ll create mode 100644 llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll create mode 100644 llvm/test/Transforms/LoopVectorize/if-pred-not-when-safe.ll create mode 100644 llvm/test/Transforms/LoopVectorize/if-pred-stores.ll create mode 100644 llvm/test/Transforms/LoopVectorize/if-reduction.ll create mode 100644 llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll create mode 100644 llvm/test/Transforms/LoopVectorize/increment.ll create mode 100644 llvm/test/Transforms/LoopVectorize/induction-step.ll create mode 100644 llvm/test/Transforms/LoopVectorize/induction.ll create mode 100644 llvm/test/Transforms/LoopVectorize/induction_plus.ll create mode 100644 llvm/test/Transforms/LoopVectorize/infiniteloop.ll create mode 100644 llvm/test/Transforms/LoopVectorize/int_sideeffect.ll create mode 100644 llvm/test/Transforms/LoopVectorize/interleaved-accesses-1.ll create mode 100644 llvm/test/Transforms/LoopVectorize/interleaved-accesses-2.ll create mode 100644 llvm/test/Transforms/LoopVectorize/interleaved-accesses-3.ll create mode 100644 llvm/test/Transforms/LoopVectorize/interleaved-accesses-alias.ll create mode 100644 llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll create mode 100644 llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll create mode 100644 llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll create mode 100644 llvm/test/Transforms/LoopVectorize/interleaved-acess-with-remarks.ll create mode 100644 llvm/test/Transforms/LoopVectorize/intrinsic.ll create mode 100644 llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll create mode 100644 llvm/test/Transforms/LoopVectorize/iv_outside_user.ll create mode 100644 llvm/test/Transforms/LoopVectorize/lcssa-crash.ll create mode 100644 llvm/test/Transforms/LoopVectorize/legal_preheader_check.ll create mode 100644 llvm/test/Transforms/LoopVectorize/libcall-remark.ll create mode 100644 llvm/test/Transforms/LoopVectorize/lifetime.ll create mode 100644 llvm/test/Transforms/LoopVectorize/loop-form.ll create mode 100644 llvm/test/Transforms/LoopVectorize/loop-scalars.ll create mode 100644 llvm/test/Transforms/LoopVectorize/loop-vect-memdep.ll create mode 100644 llvm/test/Transforms/LoopVectorize/memdep.ll create mode 100644 llvm/test/Transforms/LoopVectorize/metadata-unroll.ll create mode 100644 llvm/test/Transforms/LoopVectorize/metadata-width.ll create mode 100644 llvm/test/Transforms/LoopVectorize/metadata.ll create mode 100644 llvm/test/Transforms/LoopVectorize/middle-block-dbg.ll create mode 100644 llvm/test/Transforms/LoopVectorize/miniters.ll create mode 100644 llvm/test/Transforms/LoopVectorize/minmax_reduction.ll create mode 100644 llvm/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll create mode 100644 llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll create mode 100644 
llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll create mode 100644 llvm/test/Transforms/LoopVectorize/no-interleave-up-front.ll create mode 100644 llvm/test/Transforms/LoopVectorize/no_array_bounds.ll create mode 100644 llvm/test/Transforms/LoopVectorize/no_idiv_reduction.ll create mode 100644 llvm/test/Transforms/LoopVectorize/no_int_induction.ll create mode 100644 llvm/test/Transforms/LoopVectorize/no_outside_user.ll create mode 100644 llvm/test/Transforms/LoopVectorize/no_switch.ll create mode 100644 llvm/test/Transforms/LoopVectorize/no_switch_disable_vectorization.ll create mode 100644 llvm/test/Transforms/LoopVectorize/noalias-md-licm.ll create mode 100644 llvm/test/Transforms/LoopVectorize/noalias-md.ll create mode 100644 llvm/test/Transforms/LoopVectorize/nofloat.ll create mode 100644 llvm/test/Transforms/LoopVectorize/non-const-n.ll create mode 100644 llvm/test/Transforms/LoopVectorize/nontemporal.ll create mode 100644 llvm/test/Transforms/LoopVectorize/nsw-crash.ll create mode 100644 llvm/test/Transforms/LoopVectorize/opt.ll create mode 100644 llvm/test/Transforms/LoopVectorize/optsize.ll create mode 100644 llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll create mode 100644 llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll create mode 100644 llvm/test/Transforms/LoopVectorize/partial-lcssa.ll create mode 100644 llvm/test/Transforms/LoopVectorize/phi-cost.ll create mode 100644 llvm/test/Transforms/LoopVectorize/phi-hang.ll create mode 100644 llvm/test/Transforms/LoopVectorize/pr25281.ll create mode 100644 llvm/test/Transforms/LoopVectorize/pr28541.ll create mode 100644 llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll create mode 100644 llvm/test/Transforms/LoopVectorize/pr30806-phi-scev.ll create mode 100644 llvm/test/Transforms/LoopVectorize/pr30806.ll create mode 100644 llvm/test/Transforms/LoopVectorize/pr31098.ll create mode 100644 llvm/test/Transforms/LoopVectorize/pr31190.ll create mode 100644 llvm/test/Transforms/LoopVectorize/pr32859.ll create mode 100644 llvm/test/Transforms/LoopVectorize/pr33706.ll create mode 100644 llvm/test/Transforms/LoopVectorize/pr34681.ll create mode 100644 llvm/test/Transforms/LoopVectorize/pr35743.ll create mode 100644 llvm/test/Transforms/LoopVectorize/pr35773.ll create mode 100644 llvm/test/Transforms/LoopVectorize/pr36311.ll create mode 100644 llvm/test/Transforms/LoopVectorize/pr36983.ll create mode 100644 llvm/test/Transforms/LoopVectorize/pr37248.ll create mode 100644 llvm/test/Transforms/LoopVectorize/pr37515.ll create mode 100755 llvm/test/Transforms/LoopVectorize/pr38800.ll create mode 100644 llvm/test/Transforms/LoopVectorize/pr39099.ll create mode 100644 llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll create mode 100644 llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll create mode 100644 llvm/test/Transforms/LoopVectorize/ptr-induction.ll create mode 100644 llvm/test/Transforms/LoopVectorize/ptr_loops.ll create mode 100644 llvm/test/Transforms/LoopVectorize/read-only.ll create mode 100644 llvm/test/Transforms/LoopVectorize/reduction-small-size.ll create mode 100644 llvm/test/Transforms/LoopVectorize/reduction.ll create mode 100644 llvm/test/Transforms/LoopVectorize/remove_metadata.ll create mode 100644 llvm/test/Transforms/LoopVectorize/reverse_induction.ll create mode 100644 llvm/test/Transforms/LoopVectorize/reverse_iter.ll create mode 100644 llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll create mode 100644 
llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll create mode 100644 llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll create mode 100644 llvm/test/Transforms/LoopVectorize/runtime-check.ll create mode 100644 llvm/test/Transforms/LoopVectorize/runtime-limit.ll create mode 100644 llvm/test/Transforms/LoopVectorize/safegep.ll create mode 100644 llvm/test/Transforms/LoopVectorize/same-base-access.ll create mode 100644 llvm/test/Transforms/LoopVectorize/scalar-select.ll create mode 100644 llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll create mode 100644 llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll create mode 100644 llvm/test/Transforms/LoopVectorize/simple-unroll.ll create mode 100644 llvm/test/Transforms/LoopVectorize/skip-iterations.ll create mode 100644 llvm/test/Transforms/LoopVectorize/small-loop.ll create mode 100644 llvm/test/Transforms/LoopVectorize/start-non-zero.ll create mode 100644 llvm/test/Transforms/LoopVectorize/store-shuffle-bug.ll create mode 100644 llvm/test/Transforms/LoopVectorize/struct_access.ll create mode 100644 llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll create mode 100644 llvm/test/Transforms/LoopVectorize/tripcount.ll create mode 100644 llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll create mode 100644 llvm/test/Transforms/LoopVectorize/unroll-novec-memcheck-metadata.ll create mode 100644 llvm/test/Transforms/LoopVectorize/unroll.ll create mode 100644 llvm/test/Transforms/LoopVectorize/unroll_novec.ll create mode 100644 llvm/test/Transforms/LoopVectorize/unsafe-dep-remark.ll create mode 100644 llvm/test/Transforms/LoopVectorize/unsized-pointee-crash.ll create mode 100644 llvm/test/Transforms/LoopVectorize/value-ptr-bug.ll create mode 100644 llvm/test/Transforms/LoopVectorize/vect-phiscev-sext-trunc.ll create mode 100644 llvm/test/Transforms/LoopVectorize/vect.omp.persistence.ll create mode 100644 llvm/test/Transforms/LoopVectorize/vect.stats.ll create mode 100644 llvm/test/Transforms/LoopVectorize/vector-geps.ll create mode 100644 llvm/test/Transforms/LoopVectorize/vectorize-once.ll create mode 100644 llvm/test/Transforms/LoopVectorize/version-mem-access.ll create mode 100644 llvm/test/Transforms/LoopVectorize/vplan-stress-test-no-explict-vf.ll create mode 100644 llvm/test/Transforms/LoopVectorize/vplan_hcfg_stress_test.ll create mode 100644 llvm/test/Transforms/LoopVectorize/write-only.ll create mode 100644 llvm/test/Transforms/LoopVectorize/zero-sized-pointee-crash.ll (limited to 'llvm/test/Transforms/LoopVectorize') diff --git a/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll b/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll new file mode 100644 index 00000000000..4766794ab45 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll @@ -0,0 +1,39 @@ +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +;CHECK-LABEL: @foo( +;CHECK: icmp eq <4 x i32> +;CHECK: select <4 x i1> +;CHECK: ret i32 +define i32 @foo(i32 %x, i32 %t, i32* nocapture %A) nounwind uwtable ssp { +entry: + %cmp10 = icmp sgt i32 %x, 0 + br i1 %cmp10, label %for.body, label %for.end + +for.body: ; preds = %entry, %if.end + %indvars.iv = phi i64 [ %indvars.iv.next, %if.end ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* 
%A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %tobool = icmp eq i32 %0, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %for.body + %1 = add nsw i64 %indvars.iv, 45 + %2 = trunc i64 %indvars.iv to i32 + %mul = mul nsw i32 %2, %t + %3 = trunc i64 %1 to i32 + %add1 = add nsw i32 %3, %mul + br label %if.end + +if.end: ; preds = %for.body, %if.then + %z.0 = phi i32 [ %add1, %if.then ], [ 9, %for.body ] + store i32 %z.0, i32* %arrayidx, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %x + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %if.end, %entry + ret i32 undef +} diff --git a/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll b/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll new file mode 100644 index 00000000000..b3eae690423 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll @@ -0,0 +1,71 @@ +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce + +; Check that we don't fall into an infinite loop. +define void @test() nounwind { +entry: + br label %for.body + +for.body: + %0 = phi i32 [ 1, %entry ], [ 0, %for.body ] + br label %for.body +} + + + +define void @test2() nounwind { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv47 = phi i64 [ 0, %entry ], [ %indvars.iv.next48, %for.body ] + %0 = phi i32 [ 1, %entry ], [ 0, %for.body ] + %indvars.iv.next48 = add i64 %indvars.iv47, 1 + br i1 undef, label %for.end, label %for.body + +for.end: ; preds = %for.body + unreachable +} + +;PR14701 +define void @start_model_rare() nounwind uwtable ssp { +entry: + br i1 undef, label %return, label %if.end + +if.end: ; preds = %entry + br i1 undef, label %cond.false, label %cond.true + +cond.true: ; preds = %if.end + unreachable + +cond.false: ; preds = %if.end + br i1 undef, label %cond.false28, label %cond.true20 + +cond.true20: ; preds = %cond.false + unreachable + +cond.false28: ; preds = %cond.false + br label %for.body40 + +for.body40: ; preds = %for.inc50, %cond.false28 + %indvars.iv123 = phi i64 [ 3, %cond.false28 ], [ %indvars.iv.next124, %for.inc50 ] + %step.0121 = phi i32 [ 1, %cond.false28 ], [ %step.1, %for.inc50 ] + br i1 undef, label %if.then46, label %for.inc50 + +if.then46: ; preds = %for.body40 + %inc47 = add nsw i32 %step.0121, 1 + br label %for.inc50 + +for.inc50: ; preds = %if.then46, %for.body40 + %k.1 = phi i32 [ undef, %for.body40 ], [ %inc47, %if.then46 ] + %step.1 = phi i32 [ %step.0121, %for.body40 ], [ %inc47, %if.then46 ] + %indvars.iv.next124 = add i64 %indvars.iv123, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next124 to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 256 + br i1 %exitcond, label %for.end52, label %for.body40 + +for.end52: ; preds = %for.inc50 + unreachable + +return: ; preds = %entry + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll b/llvm/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll new file mode 100644 index 00000000000..baf96b84a34 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll @@ -0,0 +1,53 @@ +; RUN: opt < %s -loop-vectorize -dce -force-vector-interleave=1 -force-vector-width=4 + +; Check that we don't crash. 
+ +target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" + +module asm "\09.ident\09\22GCC: (GNU) 4.6.3 LLVM: 3.2svn\22" + +@b = common global [32000 x float] zeroinitializer, align 16 + +define i32 @set1ds(i32 %_n, float* nocapture %arr, float %value, i32 %stride) nounwind uwtable { +entry: + %0 = icmp sgt i32 %_n, 0 + br i1 %0, label %"3.lr.ph", label %"5" + +"3.lr.ph": ; preds = %entry + %1 = bitcast float* %arr to i8* + %2 = sext i32 %stride to i64 + br label %"3" + +"3": ; preds = %"3.lr.ph", %"3" + %indvars.iv = phi i64 [ 0, %"3.lr.ph" ], [ %indvars.iv.next, %"3" ] + %3 = shl nsw i64 %indvars.iv, 2 + %4 = getelementptr inbounds i8, i8* %1, i64 %3 + %5 = bitcast i8* %4 to float* + store float %value, float* %5, align 4 + %indvars.iv.next = add i64 %indvars.iv, %2 + %6 = trunc i64 %indvars.iv.next to i32 + %7 = icmp slt i32 %6, %_n + br i1 %7, label %"3", label %"5" + +"5": ; preds = %"3", %entry + ret i32 0 +} + +define i32 @init(i8* nocapture %name) unnamed_addr nounwind uwtable { +entry: + br label %"3" + +"3": ; preds = %"3", %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %"3" ] + %0 = shl nsw i64 %indvars.iv, 2 + %1 = getelementptr inbounds i8, i8* bitcast (float* getelementptr inbounds ([32000 x float], [32000 x float]* @b, i64 0, i64 16000) to i8*), i64 %0 + %2 = bitcast i8* %1 to float* + store float -1.000000e+00, float* %2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 16000 + br i1 %exitcond, label %"5", label %"3" + +"5": ; preds = %"3" + ret i32 0 +} diff --git a/llvm/test/Transforms/LoopVectorize/2016-07-27-loop-vec.ll b/llvm/test/Transforms/LoopVectorize/2016-07-27-loop-vec.ll new file mode 100644 index 00000000000..f64dcb36f70 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/2016-07-27-loop-vec.ll @@ -0,0 +1,19 @@ +; RUN: opt < %s -loop-vectorize -S + +define void @foo() local_unnamed_addr { +entry: + %exitcond = icmp eq i64 3, 3 + br label %for.body + +for.body: ; preds = %entry + %i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %total1 = add nsw i64 %i.05, 3 + %inc = add nuw nsw i64 %i.05, 1 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0 + +for.end: ; preds = %for.body + ret void +} + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll new file mode 100644 index 00000000000..eb12803a344 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll @@ -0,0 +1,79 @@ +; REQUIRES: asserts +; RUN: opt < %s -loop-vectorize -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST +; RUN: opt < %s -loop-vectorize -force-vector-width=2 -instcombine -simplifycfg -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; This test checks that we correctly compute the scalarized operands for a +; user-specified vectorization factor when interleaving is disabled. We use the +; "optsize" attribute to disable all interleaving calculations. A cost of 4 +; for %tmp4 indicates that we would scalarize it's operand (%tmp3), giving +; %tmp4 a lower scalarization overhead. 
+; +; COST-LABEL: predicated_udiv_scalarized_operand +; COST: LV: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i64 %tmp2, %tmp3 +; +; CHECK-LABEL: @predicated_udiv_scalarized_operand( +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %entry ], [ [[INDEX_NEXT:%.*]], %[[PRED_UDIV_CONTINUE2:.*]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, %entry ], [ [[TMP17:%.*]], %[[PRED_UDIV_CONTINUE2]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, i64* %a, i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; CHECK-NEXT: br i1 [[TMP3]], label %[[PRED_UDIV_IF:.*]], label %[[PRED_UDIV_CONTINUE:.*]] +; CHECK: [[PRED_UDIV_IF]]: +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], %x +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> undef, i64 [[TMP7]], i32 0 +; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE]] +; CHECK: [[PRED_UDIV_CONTINUE]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i64> [ undef, %vector.body ], [ [[TMP8]], %[[PRED_UDIV_IF]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_UDIV_IF1:.*]], label %[[PRED_UDIV_CONTINUE2]] +; CHECK: [[PRED_UDIV_IF1]]: +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = add nsw i64 [[TMP11]], %x +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 [[TMP13]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP14]], i32 1 +; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE2]] +; CHECK: [[PRED_UDIV_CONTINUE2]]: +; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i64> [ [[TMP9]], %[[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], %[[PRED_UDIV_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[TMP16]], <2 x i64> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP17]] = add <2 x i64> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; +define i64 @predicated_udiv_scalarized_operand(i64* %a, i64 %x) optsize { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ] + %r = phi i64 [ 0, %entry ], [ %tmp6, %for.inc ] + %tmp0 = getelementptr inbounds i64, i64* %a, i64 %i + %tmp2 = load i64, i64* %tmp0, align 4 + %cond0 = icmp sgt i64 %tmp2, 0 + br i1 %cond0, label %if.then, label %for.inc + +if.then: + %tmp3 = add nsw i64 %tmp2, %x + %tmp4 = udiv i64 %tmp2, %tmp3 + br label %for.inc + +for.inc: + %tmp5 = phi i64 [ %tmp2, %for.body ], [ %tmp4, %if.then] + %tmp6 = add i64 %r, %tmp5 + %i.next = add nuw nsw i64 %i, 1 + %cond1 = icmp slt i64 %i.next, 100 + br i1 %cond1, label %for.body, label %for.end + +for.end: + %tmp7 = phi i64 [ %tmp6, %for.inc ] + ret i64 %tmp7 +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-unroll.ll new file mode 100644 index 00000000000..a689f44e912 --- /dev/null +++ 
b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-unroll.ll @@ -0,0 +1,42 @@ +; RUN: opt < %s -loop-vectorize -mtriple=aarch64-none-linux-gnu -mattr=+neon -S | FileCheck %s +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; Function Attrs: nounwind +define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* %c, i32 %size) { +;CHECK-LABEL: array_add +;CHECK: load <4 x i32> +;CHECK: load <4 x i32> +;CHECK: load <4 x i32> +;CHECK: load <4 x i32> +;CHECK: add nsw <4 x i32> +;CHECK: add nsw <4 x i32> +;CHECK: store <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret +entry: + %cmp10 = icmp sgt i32 %size, 0 + br i1 %cmp10, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + store i32 %add, i32* %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %size + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret i32* %c +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll b/llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll new file mode 100644 index 00000000000..81ecd57b2a8 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll @@ -0,0 +1,147 @@ +; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 | FileCheck %s +; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s --check-prefix=FORCE-VEC + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnueabi" + +; Test integer induction variable of step 2: +; for (int i = 0; i < 1024; i+=2) { +; int tmp = *A++; +; sum += i * tmp; +; } + +; CHECK-LABEL: @ind_plus2( +; CHECK: load <4 x i32>, <4 x i32>* +; CHECK: load <4 x i32>, <4 x i32>* +; CHECK: mul nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: %index.next = add i64 %index, 8 +; CHECK: icmp eq i64 %index.next, 512 + +; FORCE-VEC-LABEL: @ind_plus2( +; FORCE-VEC: %wide.load = load <2 x i32>, <2 x i32>* +; FORCE-VEC: mul nsw <2 x i32> +; FORCE-VEC: add nsw <2 x i32> +; FORCE-VEC: %index.next = add i64 %index, 2 +; FORCE-VEC: icmp eq i64 %index.next, 512 +define i32 @ind_plus2(i32* %A) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %A.addr = phi i32* [ %A, %entry ], [ %inc.ptr, %for.body ] + %i = phi i32 [ 0, %entry ], [ %add1, %for.body ] + %sum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %inc.ptr = getelementptr inbounds i32, i32* %A.addr, i64 1 + %0 = load i32, i32* %A.addr, align 4 + %mul = mul nsw i32 %0, %i + %add = add nsw i32 %mul, %sum + %add1 = add nsw i32 %i, 2 + %cmp = icmp slt i32 %add1, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + %add.lcssa = phi i32 [ %add, %for.body ] + ret i32 %add.lcssa +} 
+ + +; Test integer induction variable of step -2: +; for (int i = 1024; i > 0; i-=2) { +; int tmp = *A++; +; sum += i * tmp; +; } + +; CHECK-LABEL: @ind_minus2( +; CHECK: load <4 x i32>, <4 x i32>* +; CHECK: load <4 x i32>, <4 x i32>* +; CHECK: mul nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: %index.next = add i64 %index, 8 +; CHECK: icmp eq i64 %index.next, 512 + +; FORCE-VEC-LABEL: @ind_minus2( +; FORCE-VEC: %wide.load = load <2 x i32>, <2 x i32>* +; FORCE-VEC: mul nsw <2 x i32> +; FORCE-VEC: add nsw <2 x i32> +; FORCE-VEC: %index.next = add i64 %index, 2 +; FORCE-VEC: icmp eq i64 %index.next, 512 +define i32 @ind_minus2(i32* %A) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %A.addr = phi i32* [ %A, %entry ], [ %inc.ptr, %for.body ] + %i = phi i32 [ 1024, %entry ], [ %sub, %for.body ] + %sum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %inc.ptr = getelementptr inbounds i32, i32* %A.addr, i64 1 + %0 = load i32, i32* %A.addr, align 4 + %mul = mul nsw i32 %0, %i + %add = add nsw i32 %mul, %sum + %sub = add nsw i32 %i, -2 + %cmp = icmp sgt i32 %i, 2 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + %add.lcssa = phi i32 [ %add, %for.body ] + ret i32 %add.lcssa +} + + +; Test pointer induction variable of step 2. As currently we don't support +; masked load/store, vectorization is possible but not beneficial. If loop +; vectorization is not enforced, LV will only do interleave. +; for (int i = 0; i < 1024; i++) { +; int tmp0 = *A++; +; int tmp1 = *A++; +; sum += tmp0 * tmp1; +; } + +; CHECK-LABEL: @ptr_ind_plus2( +; CHECK: %[[V0:.*]] = load <8 x i32> +; CHECK: %[[V1:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: %index.next = add i64 %index, 8 +; CHECK: icmp eq i64 %index.next, 1024 + +; FORCE-VEC-LABEL: @ptr_ind_plus2( +; FORCE-VEC: %[[V:.*]] = load <4 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> +; FORCE-VEC: mul nsw <2 x i32> +; FORCE-VEC: add nsw <2 x i32> +; FORCE-VEC: %index.next = add i64 %index, 2 +; FORCE-VEC: icmp eq i64 %index.next, 1024 +define i32 @ptr_ind_plus2(i32* %A) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %A.addr = phi i32* [ %A, %entry ], [ %inc.ptr1, %for.body ] + %sum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %i = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %inc.ptr = getelementptr inbounds i32, i32* %A.addr, i64 1 + %0 = load i32, i32* %A.addr, align 4 + %inc.ptr1 = getelementptr inbounds i32, i32* %A.addr, i64 2 + %1 = load i32, i32* %inc.ptr, align 4 + %mul = mul nsw i32 %1, %0 + %add = add nsw i32 %mul, %sum + %inc = add nsw i32 %i, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + %add.lcssa = phi i32 [ %add, %for.body ] + ret i32 %add.lcssa +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/arm64-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/arm64-unroll.ll new file mode 100644 index 00000000000..395b468c509 --- /dev/null +++ 
b/llvm/test/Transforms/LoopVectorize/AArch64/arm64-unroll.ll @@ -0,0 +1,42 @@ +; RUN: opt < %s -loop-vectorize -mtriple=arm64-none-linux-gnu -mattr=+neon -S | FileCheck %s +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; Function Attrs: nounwind +define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* %c, i32 %size) { +;CHECK-LABEL: array_add +;CHECK: load <4 x i32> +;CHECK: load <4 x i32> +;CHECK: load <4 x i32> +;CHECK: load <4 x i32> +;CHECK: add nsw <4 x i32> +;CHECK: add nsw <4 x i32> +;CHECK: store <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret +entry: + %cmp10 = icmp sgt i32 %size, 0 + br i1 %cmp10, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + store i32 %add, i32* %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %size + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret i32* %c +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/backedge-overflow.ll b/llvm/test/Transforms/LoopVectorize/AArch64/backedge-overflow.ll new file mode 100644 index 00000000000..aba47f6c628 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/backedge-overflow.ll @@ -0,0 +1,166 @@ +; RUN: opt -mtriple=aarch64--linux-gnueabi -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s -S | FileCheck %s + +; The following tests contain loops for which SCEV cannot determine the backedge +; taken count. This is because the backedge taken condition is produced by an +; icmp with one of the sides being a loop varying non-AddRec expression. +; However, there is a possibility to normalize this to an AddRec expression +; using SCEV predicates. This allows us to compute a 'guarded' backedge count. +; The Loop Vectorizer is able to version to loop in order to use this guarded +; backedge count and vectorize more loops. 
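+;
+; As a rough C sketch of the first test below (test_sge), reconstructed from
+; the IR rather than taken from an original source file: the induction
+; variable is only 16 bits wide, so its extension to i32 is not an AddRec
+; until SCEV adds a no-wrap predicate for it, and that predicate is what the
+; vector.scevcheck block verifies at run time.
+;
+;   for (unsigned short i = 0; i < N; i++)   // N is a signed 32-bit count
+;     A[i] = B[i] * C[i];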
+ + +; CHECK-LABEL: test_sge +; CHECK-LABEL: vector.scevcheck +; CHECK-LABEL: vector.body +define void @test_sge(i32* noalias %A, + i32* noalias %B, + i32* noalias %C, i32 %N) { +entry: + %cmp13 = icmp eq i32 %N, 0 + br i1 %cmp13, label %for.end, label %for.body.preheader + +for.body.preheader: + br label %for.body + +for.body: + %indvars.iv = phi i16 [ %indvars.next, %for.body ], [ 0, %for.body.preheader ] + %indvars.next = add i16 %indvars.iv, 1 + %indvars.ext = zext i16 %indvars.iv to i32 + + %arrayidx = getelementptr inbounds i32, i32* %B, i32 %indvars.ext + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %indvars.ext + %1 = load i32, i32* %arrayidx3, align 4 + + %mul4 = mul i32 %1, %0 + + %arrayidx7 = getelementptr inbounds i32, i32* %A, i32 %indvars.ext + store i32 %mul4, i32* %arrayidx7, align 4 + + %exitcond = icmp sge i32 %indvars.ext, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + +; CHECK-LABEL: test_uge +; CHECK-LABEL: vector.scevcheck +; CHECK-LABEL: vector.body +define void @test_uge(i32* noalias %A, + i32* noalias %B, + i32* noalias %C, i32 %N, i32 %Offset) { +entry: + %cmp13 = icmp eq i32 %N, 0 + br i1 %cmp13, label %for.end, label %for.body.preheader + +for.body.preheader: + br label %for.body + +for.body: + %indvars.iv = phi i16 [ %indvars.next, %for.body ], [ 0, %for.body.preheader ] + %indvars.next = add i16 %indvars.iv, 1 + + %indvars.ext = sext i16 %indvars.iv to i32 + %indvars.access = add i32 %Offset, %indvars.ext + + %arrayidx = getelementptr inbounds i32, i32* %B, i32 %indvars.access + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %indvars.access + %1 = load i32, i32* %arrayidx3, align 4 + + %mul4 = add i32 %1, %0 + + %arrayidx7 = getelementptr inbounds i32, i32* %A, i32 %indvars.access + store i32 %mul4, i32* %arrayidx7, align 4 + + %exitcond = icmp uge i32 %indvars.ext, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + +; CHECK-LABEL: test_ule +; CHECK-LABEL: vector.scevcheck +; CHECK-LABEL: vector.body +define void @test_ule(i32* noalias %A, + i32* noalias %B, + i32* noalias %C, i32 %N, + i16 %M) { +entry: + %cmp13 = icmp eq i32 %N, 0 + br i1 %cmp13, label %for.end, label %for.body.preheader + +for.body.preheader: + br label %for.body + +for.body: + %indvars.iv = phi i16 [ %indvars.next, %for.body ], [ %M, %for.body.preheader ] + %indvars.next = sub i16 %indvars.iv, 1 + %indvars.ext = zext i16 %indvars.iv to i32 + + %arrayidx = getelementptr inbounds i32, i32* %B, i32 %indvars.ext + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %indvars.ext + %1 = load i32, i32* %arrayidx3, align 4 + + %mul4 = mul i32 %1, %0 + + %arrayidx7 = getelementptr inbounds i32, i32* %A, i32 %indvars.ext + store i32 %mul4, i32* %arrayidx7, align 4 + + %exitcond = icmp ule i32 %indvars.ext, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + +; CHECK-LABEL: test_sle +; CHECK-LABEL: vector.scevcheck +; CHECK-LABEL: vector.body +define void @test_sle(i32* noalias %A, + i32* noalias %B, + i32* noalias %C, i32 %N, + i16 %M) { +entry: + %cmp13 = icmp eq i32 %N, 0 + br i1 %cmp13, label %for.end, label %for.body.preheader + +for.body.preheader: + br label %for.body + +for.body: + %indvars.iv = phi i16 [ 
%indvars.next, %for.body ], [ %M, %for.body.preheader ] + %indvars.next = sub i16 %indvars.iv, 1 + %indvars.ext = sext i16 %indvars.iv to i32 + + %arrayidx = getelementptr inbounds i32, i32* %B, i32 %indvars.ext + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %indvars.ext + %1 = load i32, i32* %arrayidx3, align 4 + + %mul4 = mul i32 %1, %0 + + %arrayidx7 = getelementptr inbounds i32, i32* %A, i32 %indvars.ext + store i32 %mul4, i32* %arrayidx7, align 4 + + %exitcond = icmp sle i32 %indvars.ext, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll new file mode 100644 index 00000000000..65f5c4e6266 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll @@ -0,0 +1,54 @@ +; RUN: opt -S < %s -loop-vectorize -instcombine 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +;; See https://llvm.org/bugs/show_bug.cgi?id=25490 +;; Due to the data structures used, the LLVM IR was not determinisic. +;; This test comes from the PR. + +;; CHECK-LABEL: @test( +; CHECK: load <16 x i8> +; CHECK-NEXT: getelementptr +; CHECK-NEXT: bitcast +; CHECK-NEXT: load <16 x i8> +; CHECK-NEXT: zext <16 x i8> +; CHECK-NEXT: zext <16 x i8> +define void @test(i32 %n, i8* nocapture %a, i8* nocapture %b, i8* nocapture readonly %c) { +entry: + %cmp.28 = icmp eq i32 %n, 0 + br i1 %cmp.28, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i8, i8* %c, i64 %indvars.iv + %0 = load i8, i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %arrayidx2 = getelementptr inbounds i8, i8* %a, i64 %indvars.iv + %1 = load i8, i8* %arrayidx2, align 1 + %conv3 = zext i8 %1 to i32 + %mul = mul nuw nsw i32 %conv3, %conv + %shr.26 = lshr i32 %mul, 8 + %conv4 = trunc i32 %shr.26 to i8 + store i8 %conv4, i8* %arrayidx2, align 1 + %arrayidx8 = getelementptr inbounds i8, i8* %b, i64 %indvars.iv + %2 = load i8, i8* %arrayidx8, align 1 + %conv9 = zext i8 %2 to i32 + %mul10 = mul nuw nsw i32 %conv9, %conv + %shr11.27 = lshr i32 %mul10, 8 + %conv12 = trunc i32 %shr11.27 to i8 + store i8 %conv12, i8* %arrayidx8, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/gather-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/gather-cost.ll new file mode 100644 index 00000000000..a40dafe6ec2 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/gather-cost.ll @@ -0,0 +1,85 @@ +; RUN: opt -loop-vectorize -mtriple=arm64-apple-ios -S -mcpu=cyclone -enable-interleaved-mem-accesses=false < %s | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128" + +@kernel = 
global [512 x float] zeroinitializer, align 16 +@kernel2 = global [512 x float] zeroinitializer, align 16 +@kernel3 = global [512 x float] zeroinitializer, align 16 +@kernel4 = global [512 x float] zeroinitializer, align 16 +@src_data = global [1536 x float] zeroinitializer, align 16 +@r_ = global i8 0, align 1 +@g_ = global i8 0, align 1 +@b_ = global i8 0, align 1 + +; We don't want to vectorize most loops containing gathers because they are +; expensive. +; Make sure we don't vectorize it. +; CHECK-NOT: x float> + +define void @_Z4testmm(i64 %size, i64 %offset) { +entry: + %cmp53 = icmp eq i64 %size, 0 + br i1 %cmp53, label %for.end, label %for.body.lr.ph + +for.body.lr.ph: + br label %for.body + +for.body: + %r.057 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add10, %for.body ] + %g.056 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add20, %for.body ] + %v.055 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ] + %add = add i64 %v.055, %offset + %mul = mul i64 %add, 3 + %arrayidx = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %mul + %0 = load float, float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [512 x float], [512 x float]* @kernel, i64 0, i64 %v.055 + %1 = load float, float* %arrayidx2, align 4 + %mul3 = fmul fast float %0, %1 + %arrayidx4 = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i64 0, i64 %v.055 + %2 = load float, float* %arrayidx4, align 4 + %mul5 = fmul fast float %mul3, %2 + %arrayidx6 = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i64 0, i64 %v.055 + %3 = load float, float* %arrayidx6, align 4 + %mul7 = fmul fast float %mul5, %3 + %arrayidx8 = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i64 0, i64 %v.055 + %4 = load float, float* %arrayidx8, align 4 + %mul9 = fmul fast float %mul7, %4 + %add10 = fadd fast float %r.057, %mul9 + %arrayidx.sum = add i64 %mul, 1 + %arrayidx11 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum + %5 = load float, float* %arrayidx11, align 4 + %mul13 = fmul fast float %1, %5 + %mul15 = fmul fast float %2, %mul13 + %mul17 = fmul fast float %3, %mul15 + %mul19 = fmul fast float %4, %mul17 + %add20 = fadd fast float %g.056, %mul19 + %arrayidx.sum52 = add i64 %mul, 2 + %arrayidx21 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum52 + %6 = load float, float* %arrayidx21, align 4 + %mul23 = fmul fast float %1, %6 + %mul25 = fmul fast float %2, %mul23 + %mul27 = fmul fast float %3, %mul25 + %mul29 = fmul fast float %4, %mul27 + %add30 = fadd fast float %b.054, %mul29 + %inc = add i64 %v.055, 1 + %exitcond = icmp ne i64 %inc, %size + br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: + %add30.lcssa = phi float [ %add30, %for.body ] + %add20.lcssa = phi float [ %add20, %for.body ] + %add10.lcssa = phi float [ %add10, %for.body ] + %phitmp = fptoui float %add10.lcssa to i8 + %phitmp60 = fptoui float %add20.lcssa to i8 + %phitmp61 = fptoui float %add30.lcssa to i8 + br label %for.end + +for.end: + %r.0.lcssa = phi i8 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] + %g.0.lcssa = phi i8 [ %phitmp60, %for.cond.for.end_crit_edge ], [ 0, %entry ] + %b.0.lcssa = phi i8 [ %phitmp61, %for.cond.for.end_crit_edge ], [ 0, %entry ] + store i8 %r.0.lcssa, i8* @r_, align 1 + store i8 %g.0.lcssa, i8* @g_, align 1 + store i8 %b.0.lcssa, i8* 
@b_, align 1 + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll new file mode 100644 index 00000000000..e8ef4256235 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll @@ -0,0 +1,30 @@ +; RUN: opt < %s -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -S | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; CHECK-LABEL: @non_primary_iv_trunc_free( +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5 +; CHECK-NEXT: [[INDUCTION:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], 5 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDUCTION]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDUCTION1]] to i32 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; +define void @non_primary_iv_trunc_free(i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %tmp0 = trunc i64 %i to i32 + %i.next = add nuw nsw i64 %i, 5 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll new file mode 100644 index 00000000000..10405b8cd16 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll @@ -0,0 +1,37 @@ +; REQUIRES: asserts +; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -S --debug-only=loop-vectorize 2>&1 | FileCheck %s + +; This test shows extremely high interleaving cost that, probably, should be fixed. +; Due to the high cost, interleaving is not beneficial and the cost model chooses to scalarize +; the load instructions. 
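+;
+; In C terms the loop below does roughly this (hypothetical source, shown
+; only to illustrate the access pattern; the struct fields are unnamed in
+; the IR):
+;
+;   struct pair { char x, y; };
+;   for (long i = 0; i != n; i++) {
+;     (void)p[i].x;
+;     (void)p[i].y;
+;   }
+;
+; Two adjacent i8 loads per iteration and no other work, so the estimated
+; interleave-group cost of 20 for VF 2 dominates and the loads end up
+; scalarized in the vector body.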
+ +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +%pair = type { i8, i8 } + +; CHECK-LABEL: test +; CHECK: Found an estimated cost of 20 for VF 2 For instruction: {{.*}} load i8 +; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8 +; CHECK: vector.body +; CHECK: load i8 +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body + +define void @test(%pair* %p, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr %pair, %pair* %p, i64 %i, i32 0 + %tmp1 = load i8, i8* %tmp0, align 1 + %tmp2 = getelementptr %pair, %pair* %p, i64 %i, i32 1 + %tmp3 = load i8, i8* %tmp2, align 1 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp eq i64 %i.next, %n + br i1 %cond, label %for.end, label %for.body + +for.end: + ret void +} + diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll new file mode 100644 index 00000000000..54ee8fc6e73 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll @@ -0,0 +1,189 @@ +; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2 +; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4 +; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8 +; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16 +; REQUIRES: asserts + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnueabi" + +%i8.2 = type {i8, i8} +define void @i8_factor_2(%i8.2* %data, i64 %n) { +entry: + br label %for.body + +; VF_8-LABEL: Checking a loop in "i8_factor_2" +; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1 +; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1 +; VF_16-LABEL: Checking a loop in "i8_factor_2" +; VF_16: Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1 +; VF_16-NEXT: Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1 +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0 + %tmp1 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 1 + %tmp2 = load i8, i8* %tmp0, align 1 + %tmp3 = load i8, i8* %tmp1, align 1 + store i8 0, i8* %tmp0, align 1 + store i8 0, i8* %tmp1, align 1 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +%i16.2 = type {i16, i16} +define void @i16_factor_2(%i16.2* %data, i64 %n) { +entry: + br label %for.body + +; VF_4-LABEL: Checking a loop in "i16_factor_2" +; VF_4: Found an 
estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_8-LABEL: Checking a loop in "i16_factor_2" +; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_16-LABEL: Checking a loop in "i16_factor_2" +; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2 +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0 + %tmp1 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 1 + %tmp2 = load i16, i16* %tmp0, align 2 + %tmp3 = load i16, i16* %tmp1, align 2 + store i16 0, i16* %tmp0, align 2 + store i16 0, i16* %tmp1, align 2 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +%i32.2 = type {i32, i32} +define void @i32_factor_2(%i32.2* %data, i64 %n) { +entry: + br label %for.body + +; VF_2-LABEL: Checking a loop in "i32_factor_2" +; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_4-LABEL: Checking a loop in "i32_factor_2" +; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_8-LABEL: Checking a loop in "i32_factor_2" +; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_16-LABEL: Checking a loop in "i32_factor_2" +; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = 
load i32, i32* %tmp1, align 4 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4 +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0 + %tmp1 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 1 + %tmp2 = load i32, i32* %tmp0, align 4 + %tmp3 = load i32, i32* %tmp1, align 4 + store i32 0, i32* %tmp0, align 4 + store i32 0, i32* %tmp1, align 4 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +%i64.2 = type {i64, i64} +define void @i64_factor_2(%i64.2* %data, i64 %n) { +entry: + br label %for.body + +; VF_2-LABEL: Checking a loop in "i64_factor_2" +; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8 +; VF_4-LABEL: Checking a loop in "i64_factor_2" +; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 4 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8 +; VF_8-LABEL: Checking a loop in "i64_factor_2" +; VF_8: Found an estimated cost of 8 for VF 8 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 8 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8 +; VF_16-LABEL: Checking a loop in "i64_factor_2" +; VF_16: Found an estimated cost of 16 for VF 16 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8 +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 0 + %tmp1 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 1 + %tmp2 = load i64, i64* %tmp0, align 8 + %tmp3 = load i64, i64* %tmp1, align 8 + store i64 0, i64* %tmp0, align 8 + store i64 0, i64* %tmp1, align 8 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +%i64.8 = type {i64, i64, i64, i64, i64, i64, i64, i64} +define void @i64_factor_8(%i64.8* %data, i64 %n) { +entry: + br label %for.body + +; The interleave factor in this test is 8, which is greater than the maximum +; allowed factor for AArch64 (4). 
Thus, we will fall back to the basic TTI +; implementation for determining the cost of the interleaved load group. The +; stores do not form a legal interleaved group because the group would contain +; gaps. +; +; VF_2-LABEL: Checking a loop in "i64_factor_8" +; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8 +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 2 + %tmp1 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 6 + %tmp2 = load i64, i64* %tmp0, align 8 + %tmp3 = load i64, i64* %tmp1, align 8 + store i64 0, i64* %tmp0, align 8 + store i64 0, i64* %tmp1, align 8 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/lit.local.cfg b/llvm/test/Transforms/LoopVectorize/AArch64/lit.local.cfg new file mode 100644 index 00000000000..937cffb2c11 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/lit.local.cfg @@ -0,0 +1,5 @@ +config.suffixes = ['.ll'] + +if not 'AArch64' in config.root.targets: + config.unsupported = True + diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll new file mode 100644 index 00000000000..1149afe7b9f --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll @@ -0,0 +1,310 @@ +; RUN: opt -S < %s -basicaa -loop-vectorize -force-vector-interleave=1 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +; CHECK-LABEL: @add_a( +; CHECK: load <16 x i8>, <16 x i8>* +; CHECK: add <16 x i8> +; CHECK: store <16 x i8> +; Function Attrs: nounwind +define void @add_a(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 { +entry: + %cmp8 = icmp sgt i32 %len, 0 + br i1 %cmp8, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv + %0 = load i8, i8* %arrayidx + %conv = zext i8 %0 to i32 + %add = add nuw nsw i32 %conv, 2 + %conv1 = trunc i32 %add to i8 + %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv + store i8 %conv1, i8* %arrayidx3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %len + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; Ensure that we preserve nuw/nsw if we're not shrinking the values we're +; working with. 
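+; Both @add_a above and @add_a1 below compute roughly the following
+; (hypothetical C source, shown only for orientation):
+;
+;   for (int i = 0; i < len; i++)
+;     q[i] = (unsigned char)(p[i] + 2);
+;
+; In @add_a the add is done in i32 and truncated back, so type shrinkage
+; rewrites it as an i8 add and the nuw/nsw flags are dropped; in @add_a1 the
+; add is already an i8 add, so its nuw/nsw flags survive into the <16 x i8>
+; vector add.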
+; CHECK-LABEL: @add_a1( +; CHECK: load <16 x i8>, <16 x i8>* +; CHECK: add nuw nsw <16 x i8> +; CHECK: store <16 x i8> +; Function Attrs: nounwind +define void @add_a1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 { +entry: + %cmp8 = icmp sgt i32 %len, 0 + br i1 %cmp8, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv + %0 = load i8, i8* %arrayidx + %add = add nuw nsw i8 %0, 2 + %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv + store i8 %add, i8* %arrayidx3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %len + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: @add_b( +; CHECK: load <8 x i16>, <8 x i16>* +; CHECK: add <8 x i16> +; CHECK: store <8 x i16> +; Function Attrs: nounwind +define void @add_b(i16* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 { +entry: + %cmp9 = icmp sgt i32 %len, 0 + br i1 %cmp9, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv + %0 = load i16, i16* %arrayidx + %conv8 = zext i16 %0 to i32 + %add = add nuw nsw i32 %conv8, 2 + %conv1 = trunc i32 %add to i16 + %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv + store i16 %conv1, i16* %arrayidx3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %len + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: @add_c( +; CHECK: load <8 x i8>, <8 x i8>* +; CHECK: add <8 x i16> +; CHECK: store <8 x i16> +; Function Attrs: nounwind +define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 { +entry: + %cmp8 = icmp sgt i32 %len, 0 + br i1 %cmp8, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv + %0 = load i8, i8* %arrayidx + %conv = zext i8 %0 to i32 + %add = add nuw nsw i32 %conv, 2 + %conv1 = trunc i32 %add to i16 + %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv + store i16 %conv1, i16* %arrayidx3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %len + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: @add_d( +; CHECK: load <4 x i16> +; CHECK: add nsw <4 x i32> +; CHECK: store <4 x i32> +define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 { +entry: + %cmp7 = icmp sgt i32 %len, 0 + br i1 %cmp7, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv + %0 = load i16, i16* %arrayidx + %conv = 
sext i16 %0 to i32 + %add = add nsw i32 %conv, 2 + %arrayidx2 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv + store i32 %add, i32* %arrayidx2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %len + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: @add_e( +; CHECK: load <16 x i8> +; CHECK: shl <16 x i8> +; CHECK: add <16 x i8> +; CHECK: or <16 x i8> +; CHECK: mul <16 x i8> +; CHECK: and <16 x i8> +; CHECK: xor <16 x i8> +; CHECK: mul <16 x i8> +; CHECK: store <16 x i8> +define void @add_e(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 { +entry: + %cmp.32 = icmp sgt i32 %len, 0 + br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %conv11 = zext i8 %arg2 to i32 + %conv13 = zext i8 %arg1 to i32 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv + %0 = load i8, i8* %arrayidx + %conv = zext i8 %0 to i32 + %add = shl i32 %conv, 4 + %conv2 = add nuw nsw i32 %add, 32 + %or = or i32 %conv, 51 + %mul = mul nuw nsw i32 %or, 60 + %and = and i32 %conv2, %conv13 + %mul.masked = and i32 %mul, 252 + %conv17 = xor i32 %mul.masked, %conv11 + %mul18 = mul nuw nsw i32 %conv17, %and + %conv19 = trunc i32 %mul18 to i8 + %arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv + store i8 %conv19, i8* %arrayidx21 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %len + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: @add_f +; CHECK: load <8 x i16> +; CHECK: trunc <8 x i16> +; CHECK: shl <8 x i8> +; CHECK: add <8 x i8> +; CHECK: or <8 x i8> +; CHECK: mul <8 x i8> +; CHECK: and <8 x i8> +; CHECK: xor <8 x i8> +; CHECK: mul <8 x i8> +; CHECK: store <8 x i8> +define void @add_f(i16* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 { +entry: + %cmp.32 = icmp sgt i32 %len, 0 + br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %conv11 = zext i8 %arg2 to i32 + %conv13 = zext i8 %arg1 to i32 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv + %0 = load i16, i16* %arrayidx + %conv = sext i16 %0 to i32 + %add = shl i32 %conv, 4 + %conv2 = add nsw i32 %add, 32 + %or = and i32 %conv, 204 + %conv8 = or i32 %or, 51 + %mul = mul nuw nsw i32 %conv8, 60 + %and = and i32 %conv2, %conv13 + %mul.masked = and i32 %mul, 252 + %conv17 = xor i32 %mul.masked, %conv11 + %mul18 = mul nuw nsw i32 %conv17, %and + %conv19 = trunc i32 %mul18 to i8 + %arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv + store i8 %conv19, i8* %arrayidx21 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %len + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: @add_phifail( +; CHECK: load <16 x i8>, <16 x i8>* +; CHECK: add nuw nsw <16 x i32> +; 
CHECK: store <16 x i8> +; Function Attrs: nounwind +define void @add_phifail(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 { +entry: + %cmp8 = icmp sgt i32 %len, 0 + br i1 %cmp8, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %a_phi = phi i32 [ %conv, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv + %0 = load i8, i8* %arrayidx + %conv = zext i8 %0 to i32 + %add = add nuw nsw i32 %conv, 2 + %conv1 = trunc i32 %add to i8 + %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv + store i8 %conv1, i8* %arrayidx3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %len + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; Function Attrs: nounwind +; When we vectorize this loop, we generate correct code +; even when %len exactly divides VF (since we extract from the second last index +; and pass this to the for.cond.cleanup block). Vectorized loop returns +; the correct value a_phi = p[len -2] +define i8 @add_phifail2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 { +; CHECK-LABEL: @add_phifail2( +; CHECK: vector.body: +; CHECK: %wide.load = load <16 x i8>, <16 x i8>* +; CHECK: %[[L1:.+]] = zext <16 x i8> %wide.load to <16 x i32> +; CHECK: add nuw nsw <16 x i32> +; CHECK: store <16 x i8> +; CHECK: add i64 %index, 16 +; CHECK: icmp eq i64 %index.next, %n.vec +; CHECK: middle.block: +; CHECK: %vector.recur.extract = extractelement <16 x i32> %[[L1]], i32 15 +; CHECK: %vector.recur.extract.for.phi = extractelement <16 x i32> %[[L1]], i32 14 +; CHECK: for.cond.cleanup: +; CHECK: %a_phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %vector.recur.extract.for.phi, %middle.block ] +; CHECK: %ret = trunc i32 %a_phi.lcssa to i8 +; CHECK: ret i8 %ret +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %ret = trunc i32 %a_phi to i8 + ret i8 %ret + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %a_phi = phi i32 [ %conv, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv + %0 = load i8, i8* %arrayidx + %conv = zext i8 %0 to i32 + %add = add nuw nsw i32 %conv, 2 + %conv1 = trunc i32 %add to i8 + %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv + store i8 %conv1, i8* %arrayidx3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %len + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { nounwind } + diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/max-vf-for-interleaved.ll b/llvm/test/Transforms/LoopVectorize/AArch64/max-vf-for-interleaved.ll new file mode 100644 index 00000000000..8b9589aebba --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/max-vf-for-interleaved.ll @@ -0,0 +1,56 @@ +; RUN: opt < %s -force-vector-interleave=1 -store-to-load-forwarding-conflict-detection=false -loop-vectorize -dce -instcombine -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +%struct.pair = type { i32, i32 } + +; Check vectorization of interleaved access groups with positive dependence +; distances. 
In this test, the maximum safe dependence distance for +; vectorization is 16 bytes. Normally, this would lead to a maximum VF of 4. +; However, for interleaved groups, the effective VF is VF * IF, where IF is the +; interleave factor. Here, the maximum safe dependence distance is recomputed +; as 16 / IF bytes, resulting in VF=2. Since IF=2, we should generate <4 x i32> +; loads and stores instead of <8 x i32> accesses. +; +; Note: LAA's conflict detection optimization has to be disabled for this test +; to be vectorized. + +; struct pair { +; int x; +; int y; +; }; +; +; void max_vf(struct pair *restrict p) { +; for (int i = 0; i < 1000; i++) { +; p[i + 2].x = p[i].x +; p[i + 2].y = p[i].y +; } +; } + +; CHECK-LABEL: @max_vf +; CHECK: load <4 x i32> +; CHECK: store <4 x i32> + +define void @max_vf(%struct.pair* noalias nocapture %p) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %0 = add nuw nsw i64 %i, 2 + %p_i.x = getelementptr inbounds %struct.pair, %struct.pair* %p, i64 %i, i32 0 + %p_i_plus_2.x = getelementptr inbounds %struct.pair, %struct.pair* %p, i64 %0, i32 0 + %1 = load i32, i32* %p_i.x, align 4 + store i32 %1, i32* %p_i_plus_2.x, align 4 + %p_i.y = getelementptr inbounds %struct.pair, %struct.pair* %p, i64 %i, i32 1 + %p_i_plus_2.y = getelementptr inbounds %struct.pair, %struct.pair* %p, i64 %0, i32 1 + %2 = load i32, i32* %p_i.y, align 4 + store i32 %2, i32* %p_i_plus_2.y, align 4 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp eq i64 %i.next, 1000 + br i1 %cond, label %for.exit, label %for.body + +for.exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll new file mode 100644 index 00000000000..247ea35ff5d --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll @@ -0,0 +1,49 @@ +; REQUIRES: asserts +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -S -debug-only=loop-vectorize 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; CHECK-LABEL: all_scalar +; CHECK: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2 +; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2 +; CHECK: LV: Not considering vector loop of width 2 because it will not generate any vector instructions +; +define void @all_scalar(i64* %a, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr i64, i64* %a, i64 %i + store i64 0, i64* %tmp0, align 1 + %i.next = add nuw nsw i64 %i, 2 + %cond = icmp eq i64 %i.next, %n + br i1 %cond, label %for.end, label %for.body + +for.end: + ret void +} + +; CHECK-LABEL: PR33193 +; CHECK: LV: Found scalar instruction: %i.next = zext i32 %j.next to i64 +; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: %i.next = zext i32 %j.next to i64 +; CHECK: LV: Not considering vector loop of width 8 because it will not generate any vector instructions +%struct.a = type { i32, i8 } +define void @PR33193(%struct.a* %a, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %j = phi i32 [ 0, %entry ], [ %j.next, %for.body ] + %tmp0 = getelementptr inbounds %struct.a, %struct.a* %a, i64 %i, i32 1 + store i8 0, i8* %tmp0, align 4 + %j.next = add i32 %j, 1 + %i.next = zext i32 %j.next to i64 + %cond 
= icmp ugt i64 %n, %i.next + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll new file mode 100644 index 00000000000..aa8478b0b6a --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll @@ -0,0 +1,144 @@ +; RUN: opt -S -loop-vectorize -enable-vplan-native-path -mtriple aarch64-gnu-linux < %s | FileCheck %s + +; extern int arr[8][8]; +; extern int arr2[8]; +; +; void foo(int n) +; { +; int i1, i2; +; +; #pragma clang loop vectorize(enable) +; for (i1 = 0; i1 < 8; i1++) { +; arr2[i1] = i1; +; for (i2 = 0; i2 < 8; i2++) +; arr[i2][i1] = i1 + n; +; } +; } +; + +; CHECK-LABEL: @foo_i32( +; CHECK-LABEL: vector.ph: +; CHECK: %[[SplatVal:.*]] = insertelement <4 x i32> undef, i32 %n, i32 0 +; CHECK: %[[Splat:.*]] = shufflevector <4 x i32> %[[SplatVal]], <4 x i32> undef, <4 x i32> zeroinitializer + +; CHECK-LABEL: vector.body: +; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ] +; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ , %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ] +; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, <4 x i64> %[[VecInd]] +; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32> +; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[VecIndTr]], <4 x i32*> %[[AAddr]], i32 4, <4 x i1> ) +; CHECK: %[[VecIndTr2:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32> +; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr2]], %[[Splat]] +; CHECK: br label %[[InnerLoop:.+]] + +; CHECK: [[InnerLoop]]: +; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ] +; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]] +; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[StoreVal]], <4 x i32*> %[[AAddr2]], i32 4, <4 x i1> %[[InnerPhi]], +; CHECK: %[[VecCond:.*]] = icmp eq <4 x i64> %[[InnerPhiNext]], +; CHECK: %[[InnerCond:.*]] = extractelement <4 x i1> %[[VecCond]], i32 0 +; CHECK: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]] + +; CHECK: [[ForInc]]: +; CHECK: %[[IndNext]] = add i64 %[[Ind]], 4 +; CHECK: %[[VecIndNext]] = add <4 x i64> %[[VecInd]], +; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8 +; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body + +@arr2 = external global [8 x i32], align 16 +@arr = external global [8 x [8 x i32]], align 16 + +@arrX = external global [8 x i64], align 16 +@arrY = external global [8 x [8 x i64]], align 16 + +; Function Attrs: norecurse nounwind uwtable +define void @foo_i32(i32 %n) { +entry: + br label %for.body + +for.body: ; preds = %for.inc8, %entry + %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc8 ] + %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, i64 %indvars.iv21 + %0 = trunc i64 %indvars.iv21 to i32 + store i32 %0, i32* %arrayidx, align 4 + %1 = trunc i64 %indvars.iv21 to i32 + %add = add nsw i32 %1, %n + br label %for.body3 + +for.body3: ; preds = %for.body3, %for.body + %indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next, %for.body3 ] + %arrayidx7 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, i64 %indvars.iv, i64 %indvars.iv21 + store 
i32 %add, i32* %arrayidx7, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 8 + br i1 %exitcond, label %for.inc8, label %for.body3 + +for.inc8: ; preds = %for.body3 + %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1 + %exitcond23 = icmp eq i64 %indvars.iv.next22, 8 + br i1 %exitcond23, label %for.end10, label %for.body, !llvm.loop !1 + +for.end10: ; preds = %for.inc8 + ret void +} + +; CHECK-LABEL: @foo_i64( +; CHECK-LABEL: vector.ph: +; CHECK: %[[SplatVal:.*]] = insertelement <2 x i64> undef, i64 %n, i32 0 +; CHECK: %[[Splat:.*]] = shufflevector <2 x i64> %[[SplatVal]], <2 x i64> undef, <2 x i32> zeroinitializer + +; CHECK-LABEL: vector.body: +; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ] +; CHECK: %[[VecInd:.*]] = phi <2 x i64> [ , %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ] +; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i64], [8 x i64]* @arrX, i64 0, <2 x i64> %[[VecInd]] +; CHECK: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> %[[VecInd]], <2 x i64*> %[[AAddr]], i32 4, <2 x i1> ) +; CHECK: %[[StoreVal:.*]] = add nsw <2 x i64> %[[VecInd]], %[[Splat]] +; CHECK: br label %[[InnerLoop:.+]] + +; CHECK: [[InnerLoop]]: +; CHECK: %[[InnerPhi:.*]] = phi <2 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ] +; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i64]], [8 x [8 x i64]]* @arrY, i64 0, <2 x i64> %[[InnerPhi]], <2 x i64> %[[VecInd]] +; CHECK: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> %[[StoreVal]], <2 x i64*> %[[AAddr2]], i32 4, <2 x i1> +; CHECK: %[[InnerPhiNext]] = add nuw nsw <2 x i64> %[[InnerPhi]], +; CHECK: %[[VecCond:.*]] = icmp eq <2 x i64> %[[InnerPhiNext]], +; CHECK: %[[InnerCond:.*]] = extractelement <2 x i1> %[[VecCond]], i32 0 +; CHECK: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]] + +; CHECK: [[ForInc]]: +; CHECK: %[[IndNext]] = add i64 %[[Ind]], 2 +; CHECK: %[[VecIndNext]] = add <2 x i64> %[[VecInd]], +; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8 +; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body +; Function Attrs: norecurse nounwind uwtable +define void @foo_i64(i64 %n) { +entry: + br label %for.body + +for.body: ; preds = %for.inc8, %entry + %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc8 ] + %arrayidx = getelementptr inbounds [8 x i64], [8 x i64]* @arrX, i64 0, i64 %indvars.iv21 + store i64 %indvars.iv21, i64* %arrayidx, align 4 + %add = add nsw i64 %indvars.iv21, %n + br label %for.body3 + +for.body3: ; preds = %for.body3, %for.body + %indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next, %for.body3 ] + %arrayidx7 = getelementptr inbounds [8 x [8 x i64]], [8 x [8 x i64]]* @arrY, i64 0, i64 %indvars.iv, i64 %indvars.iv21 + store i64 %add, i64* %arrayidx7, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 8 + br i1 %exitcond, label %for.inc8, label %for.body3 + +for.inc8: ; preds = %for.body3 + %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1 + %exitcond23 = icmp eq i64 %indvars.iv.next22, 8 + br i1 %exitcond23, label %for.end10, label %for.body, !llvm.loop !1 + +for.end10: ; preds = %for.inc8 + ret void +} + + +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr31900.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr31900.ll new file mode 100644 index 00000000000..5ea38a4a246 --- /dev/null +++ 
b/llvm/test/Transforms/LoopVectorize/AArch64/pr31900.ll @@ -0,0 +1,37 @@ +; RUN: opt -S -mtriple=aarch64-apple-ios -loop-vectorize -enable-interleaved-mem-accesses -force-vector-width=2 < %s | FileCheck %s + +; Reproducer for address space fault in the LoopVectorizer (pr31900). Added +; different sized address space pointers (p:16:16-p4:32:16) to the aarch64 +; datalayout to reproduce the fault. + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128-p:16:16-p4:32:16" + +; Check that all the loads are scalarized +; CHECK: load i16, i16* +; CHECK: load i16, i16* +; CHECK: load i16, i16 addrspace(4)* +; CHECK: load i16, i16 addrspace(4)* + +%rec1445 = type { i16, i16, i16, i16, i16 } + +define void @foo() { +bb1: + br label %bb4 + +bb4: + %tmp1 = phi i16 [ undef, %bb1 ], [ %_tmp1013, %bb4 ] + %tmp2 = phi %rec1445* [ undef, %bb1 ], [ %_tmp1015, %bb4 ] + %tmp3 = phi %rec1445 addrspace(4)* [ undef, %bb1 ], [ %_tmp1017, %bb4 ] + %0 = getelementptr %rec1445, %rec1445* %tmp2, i16 0, i32 1 + %_tmp987 = load i16, i16* %0, align 1 + %1 = getelementptr %rec1445, %rec1445 addrspace(4)* %tmp3, i32 0, i32 1 + %_tmp993 = load i16, i16 addrspace(4)* %1, align 1 + %_tmp1013 = add i16 %tmp1, 1 + %_tmp1015 = getelementptr %rec1445, %rec1445* %tmp2, i16 1 + %_tmp1017 = getelementptr %rec1445, %rec1445 addrspace(4)* %tmp3, i32 1 + %_tmp1019 = icmp ult i16 %_tmp1013, 24 + br i1 %_tmp1019, label %bb4, label %bb16 + +bb16: + unreachable +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll new file mode 100644 index 00000000000..6763940bf98 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll @@ -0,0 +1,56 @@ +; RUN: opt -S -mtriple=aarch64 -loop-vectorize -force-vector-width=2 < %s | FileCheck %s +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +@b = common local_unnamed_addr global i32 0, align 4 +@a = common local_unnamed_addr global i16* null, align 8 + +; Function Attrs: norecurse nounwind readonly +define i32 @fn1() local_unnamed_addr #0 { +; Ensure that we don't emit reduction intrinsics for unsupported short reductions. +; CHECK-NOT: @llvm.experimental.vector.reduce +entry: + %0 = load i32, i32* @b, align 4, !tbaa !1 + %cmp40 = icmp sgt i32 %0, 0 + br i1 %cmp40, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %1 = load i16*, i16** @a, align 8, !tbaa !5 + %2 = load i32, i32* @b, align 4, !tbaa !1 + %3 = sext i32 %2 to i64 + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %d.043 = phi i16 [ undef, %for.body.lr.ph ], [ %.sink28, %for.body ] + %c.042 = phi i16 [ undef, %for.body.lr.ph ], [ %c.0., %for.body ] + %arrayidx = getelementptr inbounds i16, i16* %1, i64 %indvars.iv + %4 = load i16, i16* %arrayidx, align 2, !tbaa !7 + %cmp2 = icmp sgt i16 %c.042, %4 + %c.0. 
= select i1 %cmp2, i16 %c.042, i16 %4 + %cmp13 = icmp slt i16 %d.043, %4 + %.sink28 = select i1 %cmp13, i16 %d.043, i16 %4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %cmp = icmp slt i64 %indvars.iv.next, %3 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + %c.0.lcssa = phi i16 [ undef, %entry ], [ %c.0., %for.body ] + %d.0.lcssa = phi i16 [ undef, %entry ], [ %.sink28, %for.body ] + %cmp26 = icmp sgt i16 %c.0.lcssa, %d.0.lcssa + %conv27 = zext i1 %cmp26 to i32 + ret i32 %conv27 +} + +attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +!llvm.ident = !{!0} + +!0 = !{!"clang"} +!1 = !{!2, !2, i64 0} +!2 = !{!"int", !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C/C++ TBAA"} +!5 = !{!6, !6, i64 0} +!6 = !{!"any pointer", !3, i64 0} +!7 = !{!8, !8, i64 0} +!8 = !{!"short", !3, i64 0} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll new file mode 100644 index 00000000000..c51c6c98ddf --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll @@ -0,0 +1,153 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -loop-vectorize -S -mtriple=aarch64-unknown-linux-gnu -force-vector-interleave=1 -force-vector-width=4 < %s | FileCheck %s + +; The test checks that there is no assert caused by issue described in PR36032 + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + +%struct.anon = type { i8 } + +@c = local_unnamed_addr global [6 x i8] zeroinitializer, align 1 +@b = internal global %struct.anon zeroinitializer, align 1 + +; Function Attrs: noreturn nounwind +define void @_Z1dv() local_unnamed_addr #0 { +; CHECK-LABEL: @_Z1dv( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CALL:%.*]] = tail call i8* @"_ZN3$_01aEv"(%struct.anon* nonnull @b) +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, i8* [[CALL]], i64 4 +; CHECK-NEXT: br label [[FOR_COND:%.*]] +; CHECK: for.cond: +; CHECK-NEXT: [[F_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD5:%.*]], [[FOR_COND_CLEANUP:%.*]] ] +; CHECK-NEXT: [[G_0:%.*]] = phi i32 [ undef, [[ENTRY]] ], [ [[G_1_LCSSA:%.*]], [[FOR_COND_CLEANUP]] ] +; CHECK-NEXT: [[CMP12:%.*]] = icmp ult i32 [[G_0]], 4 +; CHECK-NEXT: [[CONV:%.*]] = and i32 [[F_0]], 65535 +; CHECK-NEXT: br i1 [[CMP12]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[G_0]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 4, [[TMP0]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK: vector.scevcheck: +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 3, [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[G_0]], [[CONV]] +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP4]]) +; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 
} [[MUL]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP3]], [[MUL_RESULT]] +; CHECK-NEXT: [[TMP6:%.*]] = sub i32 [[TMP3]], [[MUL_RESULT]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP3]] +; CHECK-NEXT: [[TMP9:%.*]] = select i1 false, i1 [[TMP7]], i1 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[TMP2]], 4294967295 +; CHECK-NEXT: [[TMP11:%.*]] = or i1 [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP11]], [[MUL_OVERFLOW]] +; CHECK-NEXT: [[TMP13:%.*]] = or i1 false, [[TMP12]] +; CHECK-NEXT: br i1 [[TMP13]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8* [[CALL]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[G_0]], [[CONV]] +; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr [6 x i8], [6 x i8]* @c, i64 0, i64 [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = sub i64 [[TMP15]], [[TMP0]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, i8* getelementptr inbounds ([6 x i8], [6 x i8]* @c, i64 0, i64 4), i64 [[TMP16]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP2]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = add i64 [[TMP0]], [[INDEX]] +; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[OFFSET_IDX4]] to i32 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> undef, i32 [[TMP18]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION7:%.*]] = add <4 x i32> [[BROADCAST_SPLAT6]], +; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 0 +; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[CONV]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP20]] to i64 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [6 x i8], [6 x i8]* @c, i64 0, i64 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[TMP22]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP23]] to <4 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP24]], align 1, !alias.scope !0 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, i8* [[CALL]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, i8* [[TMP25]], i32 0 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to <4 x i8>* +; CHECK-NEXT: store <4 x i8> 
[[WIDE_LOAD]], <4 x i8>* [[TMP27]], align 1, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[FOR_BODY_LR_PH]] ], [ [[TMP0]], [[VECTOR_SCEVCHECK]] ], [ [[TMP0]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[G_1_LCSSA]] = phi i32 [ [[G_0]], [[FOR_COND]] ], [ 4, [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[ADD5]] = add nuw nsw i32 [[CONV]], 4 +; CHECK-NEXT: br label [[FOR_COND]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP29:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[CONV]], [[TMP29]] +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[ADD]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [6 x i8], [6 x i8]* @c, i64 0, i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP30:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, i8* [[CALL]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i8 [[TMP30]], i8* [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !7 +; +entry: + %call = tail call i8* @"_ZN3$_01aEv"(%struct.anon* nonnull @b) #2 + br label %for.cond + +for.cond: ; preds = %for.cond.cleanup, %entry + %f.0 = phi i32 [ 0, %entry ], [ %add5, %for.cond.cleanup ] + %g.0 = phi i32 [ undef, %entry ], [ %g.1.lcssa, %for.cond.cleanup ] + %cmp12 = icmp ult i32 %g.0, 4 + %conv = and i32 %f.0, 65535 + br i1 %cmp12, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %for.cond + %0 = zext i32 %g.0 to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %for.cond + %g.1.lcssa = phi i32 [ %g.0, %for.cond ], [ 4, %for.cond.cleanup.loopexit ] + %add5 = add nuw nsw i32 %conv, 4 + br label %for.cond + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ %0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %1 = trunc i64 %indvars.iv to i32 + %add = add i32 %conv, %1 + %idxprom = zext i32 %add to i64 + %arrayidx = getelementptr inbounds [6 x i8], [6 x i8]* @c, i64 0, i64 %idxprom + %2 = load i8, i8* %arrayidx, align 1 + %arrayidx3 = getelementptr inbounds i8, i8* %call, i64 %indvars.iv + store i8 %2, i8* %arrayidx3, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 4 + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + +declare i8* @"_ZN3$_01aEv"(%struct.anon*) local_unnamed_addr #1 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll new file mode 100644 
index 00000000000..b0ebb4edf2a --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll @@ -0,0 +1,231 @@ +; REQUIRES: asserts +; RUN: opt < %s -force-vector-width=2 -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; Check predication-related cost calculations, including scalarization overhead +; and block probability scaling. Note that the functionality being tested is +; not specific to AArch64. We specify a target to get actual values for the +; instruction costs. + +; CHECK-LABEL: predicated_udiv +; +; This test checks that we correctly compute the cost of the predicated udiv +; instruction. If we assume the block probability is 50%, we compute the cost +; as: +; +; Cost of udiv: +; (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5 +; +; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 +; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 +; +define i32 @predicated_udiv(i32* %a, i32* %b, i1 %c, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ] + %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ] + %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i + %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i + %tmp2 = load i32, i32* %tmp0, align 4 + %tmp3 = load i32, i32* %tmp1, align 4 + br i1 %c, label %if.then, label %for.inc + +if.then: + %tmp4 = udiv i32 %tmp2, %tmp3 + br label %for.inc + +for.inc: + %tmp5 = phi i32 [ %tmp3, %for.body ], [ %tmp4, %if.then] + %tmp6 = add i32 %r, %tmp5 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + %tmp7 = phi i32 [ %tmp6, %for.inc ] + ret i32 %tmp7 +} + +; CHECK-LABEL: predicated_store +; +; This test checks that we correctly compute the cost of the predicated store +; instruction. If we assume the block probability is 50%, we compute the cost +; as: +; +; Cost of store: +; (store(4) + extractelement(3)) / 2 = 3 +; +; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4 +; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4 +; +define void @predicated_store(i32* %a, i1 %c, i32 %x, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ] + %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i + %tmp1 = load i32, i32* %tmp0, align 4 + %tmp2 = add nsw i32 %tmp1, %x + br i1 %c, label %if.then, label %for.inc + +if.then: + store i32 %tmp2, i32* %tmp0, align 4 + br label %for.inc + +for.inc: + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +; CHECK-LABEL: predicated_udiv_scalarized_operand +; +; This test checks that we correctly compute the cost of the predicated udiv +; instruction and the add instruction it uses. The add is scalarized and sunk +; inside the predicated block. 
If we assume the block probability is 50%, we +; compute the cost as: +; +; Cost of add: +; (add(2) + extractelement(3)) / 2 = 2 +; Cost of udiv: +; (udiv(2) + extractelement(3) + insertelement(3)) / 2 = 4 +; +; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x +; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 +; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x +; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 +; +define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ] + %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ] + %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i + %tmp2 = load i32, i32* %tmp0, align 4 + br i1 %c, label %if.then, label %for.inc + +if.then: + %tmp3 = add nsw i32 %tmp2, %x + %tmp4 = udiv i32 %tmp2, %tmp3 + br label %for.inc + +for.inc: + %tmp5 = phi i32 [ %tmp2, %for.body ], [ %tmp4, %if.then] + %tmp6 = add i32 %r, %tmp5 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + %tmp7 = phi i32 [ %tmp6, %for.inc ] + ret i32 %tmp7 +} + +; CHECK-LABEL: predicated_store_scalarized_operand +; +; This test checks that we correctly compute the cost of the predicated store +; instruction and the add instruction it uses. The add is scalarized and sunk +; inside the predicated block. If we assume the block probability is 50%, we +; compute the cost as: +; +; Cost of add: +; (add(2) + extractelement(3)) / 2 = 2 +; Cost of store: +; store(4) / 2 = 2 +; +; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x +; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4 +; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x +; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4 +; +define void @predicated_store_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ] + %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i + %tmp1 = load i32, i32* %tmp0, align 4 + br i1 %c, label %if.then, label %for.inc + +if.then: + %tmp2 = add nsw i32 %tmp1, %x + store i32 %tmp2, i32* %tmp0, align 4 + br label %for.inc + +for.inc: + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +; CHECK-LABEL: predication_multi_context +; +; This test checks that we correctly compute the cost of multiple predicated +; instructions in the same block. The sdiv, udiv, and store must be scalarized +; and predicated. The sub feeding the store is scalarized and sunk inside the +; store's predicated block. However, the add feeding the sdiv and udiv cannot +; be sunk and is not scalarized. 
If we assume the block probability is 50%, we +; compute the cost as: +; +; Cost of add: +; add(1) = 1 +; Cost of sdiv: +; (sdiv(2) + extractelement(6) + insertelement(3)) / 2 = 5 +; Cost of udiv: +; (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5 +; Cost of sub: +; (sub(2) + extractelement(3)) / 2 = 2 +; Cost of store: +; store(4) / 2 = 2 +; +; CHECK-NOT: Scalarizing: %tmp2 = add i32 %tmp1, %x +; CHECK: Scalarizing and predicating: %tmp3 = sdiv i32 %tmp1, %tmp2 +; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2 +; CHECK: Scalarizing: %tmp5 = sub i32 %tmp4, %x +; CHECK: Scalarizing and predicating: store i32 %tmp5, i32* %tmp0, align 4 +; CHECK: Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x +; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2 +; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2 +; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x +; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, i32* %tmp0, align 4 +; +define void @predication_multi_context(i32* %a, i1 %c, i32 %x, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ] + %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i + %tmp1 = load i32, i32* %tmp0, align 4 + br i1 %c, label %if.then, label %for.inc + +if.then: + %tmp2 = add i32 %tmp1, %x + %tmp3 = sdiv i32 %tmp1, %tmp2 + %tmp4 = udiv i32 %tmp3, %tmp2 + %tmp5 = sub i32 %tmp4, %x + store i32 %tmp5, i32* %tmp0, align 4 + br label %for.inc + +for.inc: + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll new file mode 100644 index 00000000000..9d9aea00e9a --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll @@ -0,0 +1,171 @@ +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; CHECK-LABEL: @reduction_i8 +; +; char reduction_i8(char *a, char *b, int n) { +; char sum = 0; +; for (int i = 0; i < n; ++i) +; sum += (a[i] + b[i]); +; return sum; +; } +; +; CHECK: vector.body: +; CHECK: phi <16 x i8> +; CHECK: load <16 x i8> +; CHECK: load <16 x i8> +; CHECK: add <16 x i8> +; CHECK: add <16 x i8> +; +; CHECK: middle.block: +; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> +; CHECK: zext i8 [[Rdx]] to i32 +; +define i8 @reduction_i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) { +entry: + %cmp.12 = icmp sgt i32 %n, 0 + br i1 %cmp.12, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + br label %for.body + +for.cond.for.cond.cleanup_crit_edge: + %add5.lcssa = phi i32 [ %add5, %for.body ] + %conv6 = trunc i32 %add5.lcssa to i8 + br label %for.cond.cleanup + +for.cond.cleanup: + %sum.0.lcssa = phi i8 [ %conv6, %for.cond.for.cond.cleanup_crit_edge ], [ 0, %entry ] + ret i8 %sum.0.lcssa + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %sum.013 = phi i32 [ %add5, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i8, i8* %a, i64 %indvars.iv + %0 = load i8, 
i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %indvars.iv + %1 = load i8, i8* %arrayidx2, align 1 + %conv3 = zext i8 %1 to i32 + %conv4 = and i32 %sum.013, 255 + %add = add nuw nsw i32 %conv, %conv4 + %add5 = add nuw nsw i32 %add, %conv3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body +} + +; CHECK-LABEL: @reduction_i16_1 +; +; short reduction_i16_1(short *a, short *b, int n) { +; short sum = 0; +; for (int i = 0; i < n; ++i) +; sum += (a[i] + b[i]); +; return sum; +; } +; +; CHECK: vector.body: +; CHECK: phi <8 x i16> +; CHECK: load <8 x i16> +; CHECK: load <8 x i16> +; CHECK: add <8 x i16> +; CHECK: add <8 x i16> +; +; CHECK: middle.block: +; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> +; CHECK: zext i16 [[Rdx]] to i32 +; +define i16 @reduction_i16_1(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %n) { +entry: + %cmp.16 = icmp sgt i32 %n, 0 + br i1 %cmp.16, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + br label %for.body + +for.cond.for.cond.cleanup_crit_edge: + %add5.lcssa = phi i32 [ %add5, %for.body ] + %conv6 = trunc i32 %add5.lcssa to i16 + br label %for.cond.cleanup + +for.cond.cleanup: + %sum.0.lcssa = phi i16 [ %conv6, %for.cond.for.cond.cleanup_crit_edge ], [ 0, %entry ] + ret i16 %sum.0.lcssa + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %sum.017 = phi i32 [ %add5, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i16, i16* %a, i64 %indvars.iv + %0 = load i16, i16* %arrayidx, align 2 + %conv.14 = zext i16 %0 to i32 + %arrayidx2 = getelementptr inbounds i16, i16* %b, i64 %indvars.iv + %1 = load i16, i16* %arrayidx2, align 2 + %conv3.15 = zext i16 %1 to i32 + %conv4.13 = and i32 %sum.017, 65535 + %add = add nuw nsw i32 %conv.14, %conv4.13 + %add5 = add nuw nsw i32 %add, %conv3.15 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body +} + +; CHECK-LABEL: @reduction_i16_2 +; +; short reduction_i16_2(char *a, char *b, int n) { +; short sum = 0; +; for (int i = 0; i < n; ++i) +; sum += (a[i] + b[i]); +; return sum; +; } +; +; CHECK: vector.body: +; CHECK: phi <8 x i16> +; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <8 x i8> +; CHECK: zext <8 x i8> [[Ld1]] to <8 x i16> +; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <8 x i8> +; CHECK: zext <8 x i8> [[Ld2]] to <8 x i16> +; CHECK: add <8 x i16> +; CHECK: add <8 x i16> +; +; CHECK: middle.block: +; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> +; CHECK: zext i16 [[Rdx]] to i32 +; +define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) { +entry: + %cmp.14 = icmp sgt i32 %n, 0 + br i1 %cmp.14, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + br label %for.body + +for.cond.for.cond.cleanup_crit_edge: + %add5.lcssa = phi i32 [ %add5, %for.body ] + %conv6 = trunc i32 %add5.lcssa to i16 + br label %for.cond.cleanup + +for.cond.cleanup: + %sum.0.lcssa = phi i16 [ %conv6, %for.cond.for.cond.cleanup_crit_edge ], [ 0, %entry ] + ret i16 %sum.0.lcssa + +for.body: + %indvars.iv = 
phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %sum.015 = phi i32 [ %add5, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i8, i8* %a, i64 %indvars.iv + %0 = load i8, i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %indvars.iv + %1 = load i8, i8* %arrayidx2, align 1 + %conv3 = zext i8 %1 to i32 + %conv4.13 = and i32 %sum.015, 65535 + %add = add nuw nsw i32 %conv, %conv4.13 + %add5 = add nuw nsw i32 %add, %conv3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sdiv-pow2.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sdiv-pow2.ll new file mode 100644 index 00000000000..f3c6548c68e --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sdiv-pow2.ll @@ -0,0 +1,31 @@ +; RUN: opt < %s -loop-vectorize -mtriple=aarch64-unknown-linux-gnu -mcpu=cortex-a57 -S | FileCheck %s +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +%struct.anon = type { [100 x i32], i32, [100 x i32] } + +@Foo = common global %struct.anon zeroinitializer, align 4 + +; CHECK-LABEL: @foo( +; CHECK: load <4 x i32>, <4 x i32>* +; CHECK: sdiv <4 x i32> +; CHECK: store <4 x i32> + +define void @foo(){ +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds %struct.anon, %struct.anon* @Foo, i64 0, i32 2, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %div = sdiv i32 %0, 2 + %arrayidx2 = getelementptr inbounds %struct.anon, %struct.anon* @Foo, i64 0, i32 0, i64 %indvars.iv + store i32 %div, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll new file mode 100644 index 00000000000..1ae7dadeffd --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll @@ -0,0 +1,33 @@ +; REQUIRES: asserts +; RUN: opt < %s -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; CHECK-LABEL: Checking a loop in "interleaved_access" +; CHECK: The Smallest and Widest types: 64 / 64 bits +; +define void @interleaved_access(i8** %A, i64 %N) { +for.ph: + br label %for.body + +for.body: + %i = phi i64 [ %i.next.3, %for.body ], [ 0, %for.ph ] + %tmp0 = getelementptr inbounds i8*, i8** %A, i64 %i + store i8* null, i8** %tmp0, align 8 + %i.next.0 = add nuw nsw i64 %i, 1 + %tmp1 = getelementptr inbounds i8*, i8** %A, i64 %i.next.0 + store i8* null, i8** %tmp1, align 8 + %i.next.1 = add nsw i64 %i, 2 + %tmp2 = getelementptr inbounds i8*, i8** %A, i64 %i.next.1 + store i8* null, i8** %tmp2, align 8 + %i.next.2 = add nsw i64 %i, 3 + %tmp3 = getelementptr inbounds i8*, i8** %A, i64 %i.next.2 + store i8* null, i8** %tmp3, align 8 + %i.next.3 = add nsw i64 %i, 4 + %cond = icmp slt i64 %i.next.3, %N + br i1 %cond, label %for.body, label %for.end + 
+for.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll new file mode 100644 index 00000000000..ffe8480138d --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll @@ -0,0 +1,47 @@ +; RUN: opt -S < %s -loop-vectorize -force-vector-width=4 | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; CHECK-LABEL: test0 +define void @test0(i16* noalias %M3) { +entry: + br label %if.then1165.us + +if.then1165.us: ; preds = %if.then1165.us, %entry + %indvars.iv1783 = phi i64 [ 0, %entry ], [ %indvars.iv.next1784, %if.then1165.us ] + %conv1177.us = zext i16 undef to i32 + %add1178.us = add nsw i32 %conv1177.us, undef + %conv1179.us = trunc i32 %add1178.us to i16 + %idxprom1181.us = ashr exact i64 undef, 32 + %arrayidx1185.us = getelementptr inbounds i16, i16* %M3, i64 %idxprom1181.us + store i16 %conv1179.us, i16* %arrayidx1185.us, align 2 + %indvars.iv.next1784 = add nuw nsw i64 %indvars.iv1783, 1 + %exitcond1785 = icmp eq i64 %indvars.iv.next1784, 16 + br i1 %exitcond1785, label %for.inc1286.loopexit, label %if.then1165.us + +for.inc1286.loopexit: ; preds = %if.then1165.us + ret void +} + +; CHECK-LABEL: test1 +define void @test1(i16* noalias %M3) { +entry: + br label %if.then1165.us + +if.then1165.us: ; preds = %if.then1165.us, %entry + %indvars.iv1783 = phi i64 [ 0, %entry ], [ %indvars.iv.next1784, %if.then1165.us ] + %fptr = load i32, i32* undef, align 4 + %conv1177.us = zext i16 undef to i32 + %add1178.us = add nsw i32 %conv1177.us, %fptr + %conv1179.us = trunc i32 %add1178.us to i16 + %idxprom1181.us = ashr exact i64 undef, 32 + %arrayidx1185.us = getelementptr inbounds i16, i16* %M3, i64 %idxprom1181.us + store i16 %conv1179.us, i16* %arrayidx1185.us, align 2 + %indvars.iv.next1784 = add nuw nsw i64 %indvars.iv1783, 1 + %exitcond1785 = icmp eq i64 %indvars.iv.next1784, 16 + br i1 %exitcond1785, label %for.inc1286.loopexit, label %if.then1165.us + +for.inc1286.loopexit: ; preds = %if.then1165.us + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AMDGPU/divergent-runtime-check.ll b/llvm/test/Transforms/LoopVectorize/AMDGPU/divergent-runtime-check.ll new file mode 100644 index 00000000000..91a916798c5 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AMDGPU/divergent-runtime-check.ll @@ -0,0 +1,29 @@ +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -loop-vectorize -simplifycfg < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -loop-vectorize -pass-remarks-analysis='loop-vectorize' < %s 2>&1 | FileCheck -check-prefixes=REMARK %s + +; GCN-LABEL: @runtime_check_divergent_target( +; GCN-NOT: load <2 x half> +; GCN-NOT: store <2 x half> + +; REMARK: remark: :0:0: loop not vectorized: runtime pointer checks needed. 
Not enabled for divergent target +define amdgpu_kernel void @runtime_check_divergent_target(half addrspace(1)* nocapture %a, half addrspace(1)* nocapture %b) #0 { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds half, half addrspace(1)* %b, i64 %indvars.iv + %load = load half, half addrspace(1)* %arrayidx, align 4 + %mul = fmul half %load, 3.0 + %arrayidx2 = getelementptr inbounds half, half addrspace(1)* %a, i64 %indvars.iv + store half %mul, half addrspace(1)* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +attributes #0 = { nounwind } diff --git a/llvm/test/Transforms/LoopVectorize/AMDGPU/lit.local.cfg b/llvm/test/Transforms/LoopVectorize/AMDGPU/lit.local.cfg new file mode 100644 index 00000000000..2a665f06be7 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AMDGPU' in config.root.targets: + config.unsupported = True diff --git a/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll b/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll new file mode 100644 index 00000000000..832843983eb --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll @@ -0,0 +1,34 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s -loop-vectorize -dce -instcombine -S | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s -loop-vectorize -dce -instcombine -S | FileCheck -check-prefix=CIVI -check-prefix=GCN %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s -loop-vectorize -dce -instcombine -S | FileCheck -check-prefix=CIVI -check-prefix=GCN %s + +; GCN-LABEL: @vectorize_v2f16_loop( +; GFX9: vector.body: +; GFX9: phi <2 x half> +; GFX9: load <2 x half> +; GFX9: fadd fast <2 x half> + +; GFX9: middle.block: +; GFX9: fadd fast <2 x half> + +; VI: phi half +; VI: phi load half +; VI: fadd fast half +define half @vectorize_v2f16_loop(half addrspace(1)* noalias %s) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %q.04 = phi half [ 0.0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds half, half addrspace(1)* %s, i64 %indvars.iv + %0 = load half, half addrspace(1)* %arrayidx, align 2 + %add = fadd fast half %q.04, %0 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 256 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + %add.lcssa = phi half [ %add, %for.body ] + ret half %add.lcssa +} diff --git a/llvm/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll b/llvm/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll new file mode 100644 index 00000000000..f303ed5377e --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll @@ -0,0 +1,28 @@ +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji -loop-vectorize < %s | FileCheck %s + + +; For AMDGPU, loop unroll in loop vectorizer is disabled when VF==1. 
+; +; CHECK-LABEL: @small_loop( +; CHECK: store i32 +; CHECK-NOT: store i32 +; CHECK: ret +define amdgpu_kernel void @small_loop(i32* nocapture %inArray, i32 %size) nounwind { +entry: + %0 = icmp sgt i32 %size, 0 + br i1 %0, label %loop, label %exit + +loop: ; preds = %entry, %loop + %iv = phi i32 [ %iv1, %loop ], [ 0, %entry ] + %1 = getelementptr inbounds i32, i32* %inArray, i32 %iv + %2 = load i32, i32* %1, align 4 + %3 = add nsw i32 %2, 6 + store i32 %3, i32* %1, align 4 + %iv1 = add i32 %iv, 1 +; %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %cond = icmp eq i32 %iv1, %size + br i1 %cond, label %exit, label %loop + +exit: ; preds = %loop, %entry + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll new file mode 100644 index 00000000000..369568f6dfa --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll @@ -0,0 +1,330 @@ +; RUN: opt -mtriple armv7-linux-gnueabihf -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX +; RUN: opt -mtriple armv8-linux-gnu -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX +; RUN: opt -mtriple armv7-unknwon-darwin -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN +; REQUIRES: asserts + +; Testing the ability of the loop vectorizer to tell when SIMD is safe or not +; regarding IEEE 754 standard. +; On Linux, we only want the vectorizer to work when -ffast-math flag is set, +; because NEON is not IEEE compliant. +; Darwin, on the other hand, doesn't support subnormals, and all optimizations +; are allowed, even without -ffast-math. + +; Integer loops are always vectorizeable +; CHECK: Checking a loop in "sumi" +; CHECK: We can vectorize this loop! +define void @sumi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) { +entry: + %cmp5 = icmp eq i32 %N, 0 + br i1 %cmp5, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06 + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06 + %1 = load i32, i32* %arrayidx1, align 4 + %mul = mul nsw i32 %1, %0 + %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06 + store i32 %mul, i32* %arrayidx2, align 4 + %inc = add nuw nsw i32 %i.06, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + +; Floating-point loops need fast-math to be vectorizeable +; LINUX: Checking a loop in "sumf" +; LINUX: Potentially unsafe FP op prevents vectorization +; DARWIN: Checking a loop in "sumf" +; DARWIN: We can vectorize this loop! 
+define void @sumf(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) { +entry: + %cmp5 = icmp eq i32 %N, 0 + br i1 %cmp5, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06 + %0 = load float, float* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06 + %1 = load float, float* %arrayidx1, align 4 + %mul = fmul float %0, %1 + %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06 + store float %mul, float* %arrayidx2, align 4 + %inc = add nuw nsw i32 %i.06, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + +; Integer loops are always vectorizeable +; CHECK: Checking a loop in "redi" +; CHECK: We can vectorize this loop! +define i32 @redi(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) { +entry: + %cmp5 = icmp eq i32 %N, 0 + br i1 %cmp5, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07 + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07 + %1 = load i32, i32* %arrayidx1, align 4 + %mul = mul nsw i32 %1, %0 + %add = add nsw i32 %mul, %Red.06 + %inc = add nuw nsw i32 %i.07, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + %add.lcssa = phi i32 [ %add, %for.body ] + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ] + ret i32 %Red.0.lcssa +} + +; Floating-point loops need fast-math to be vectorizeable +; LINUX: Checking a loop in "redf" +; LINUX: Potentially unsafe FP op prevents vectorization +; DARWIN: Checking a loop in "redf" +; DARWIN: We can vectorize this loop! 
+define float @redf(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) { +entry: + %cmp5 = icmp eq i32 %N, 0 + br i1 %cmp5, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ] + %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07 + %0 = load float, float* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07 + %1 = load float, float* %arrayidx1, align 4 + %mul = fmul float %0, %1 + %add = fadd float %Red.06, %mul + %inc = add nuw nsw i32 %i.07, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + %add.lcssa = phi float [ %add, %for.body ] + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ] + ret float %Red.0.lcssa +} + +; Make sure calls that turn into builtins are also covered +; LINUX: Checking a loop in "fabs" +; LINUX: Potentially unsafe FP op prevents vectorization +; DARWIN: Checking a loop in "fabs" +; DARWIN: We can vectorize this loop! +define void @fabs(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) { +entry: + %cmp10 = icmp eq i32 %N, 0 + br i1 %cmp10, label %for.end, label %for.body + +for.body: ; preds = %entry, %for.body + %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011 + %0 = load float, float* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011 + %1 = load float, float* %arrayidx1, align 4 + %fabsf = tail call float @fabsf(float %1) #1 + %conv3 = fmul float %0, %fabsf + %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011 + store float %conv3, float* %arrayidx4, align 4 + %inc = add nuw nsw i32 %i.011, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; Integer loops are always vectorizeable +; CHECK: Checking a loop in "sumi_fast" +; CHECK: We can vectorize this loop! +define void @sumi_fast(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) { +entry: + %cmp5 = icmp eq i32 %N, 0 + br i1 %cmp5, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06 + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06 + %1 = load i32, i32* %arrayidx1, align 4 + %mul = mul nsw i32 %1, %0 + %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06 + store i32 %mul, i32* %arrayidx2, align 4 + %inc = add nuw nsw i32 %i.06, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + +; Floating-point loops can be vectorizeable with fast-math +; CHECK: Checking a loop in "sumf_fast" +; CHECK: We can vectorize this loop! 
+define void @sumf_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) { +entry: + %cmp5 = icmp eq i32 %N, 0 + br i1 %cmp5, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06 + %0 = load float, float* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06 + %1 = load float, float* %arrayidx1, align 4 + %mul = fmul fast float %1, %0 + %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06 + store float %mul, float* %arrayidx2, align 4 + %inc = add nuw nsw i32 %i.06, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + +; Integer loops are always vectorizeable +; CHECK: Checking a loop in "redi_fast" +; CHECK: We can vectorize this loop! +define i32 @redi_fast(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) { +entry: + %cmp5 = icmp eq i32 %N, 0 + br i1 %cmp5, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07 + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07 + %1 = load i32, i32* %arrayidx1, align 4 + %mul = mul nsw i32 %1, %0 + %add = add nsw i32 %mul, %Red.06 + %inc = add nuw nsw i32 %i.07, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + %add.lcssa = phi i32 [ %add, %for.body ] + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ] + ret i32 %Red.0.lcssa +} + +; Floating-point loops can be vectorizeable with fast-math +; CHECK: Checking a loop in "redf_fast" +; CHECK: We can vectorize this loop! 
+define float @redf_fast(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) { +entry: + %cmp5 = icmp eq i32 %N, 0 + br i1 %cmp5, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ] + %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07 + %0 = load float, float* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07 + %1 = load float, float* %arrayidx1, align 4 + %mul = fmul fast float %1, %0 + %add = fadd fast float %mul, %Red.06 + %inc = add nuw nsw i32 %i.07, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + %add.lcssa = phi float [ %add, %for.body ] + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ] + ret float %Red.0.lcssa +} + +; Make sure calls that turn into builtins are also covered +; CHECK: Checking a loop in "fabs_fast" +; CHECK: We can vectorize this loop! +define void @fabs_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) { +entry: + %cmp10 = icmp eq i32 %N, 0 + br i1 %cmp10, label %for.end, label %for.body + +for.body: ; preds = %entry, %for.body + %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011 + %0 = load float, float* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011 + %1 = load float, float* %arrayidx1, align 4 + %fabsf = tail call fast float @fabsf(float %1) #2 + %conv3 = fmul fast float %fabsf, %0 + %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011 + store float %conv3, float* %arrayidx4, align 4 + %inc = add nuw nsw i32 %i.011, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +declare float @fabsf(float) + +attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" } diff --git a/llvm/test/Transforms/LoopVectorize/ARM/arm-unroll.ll b/llvm/test/Transforms/LoopVectorize/ARM/arm-unroll.ll new file mode 100644 index 00000000000..7b09913636f --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/ARM/arm-unroll.ll @@ -0,0 +1,71 @@ +; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S | FileCheck %s --check-prefix=SWIFT +; RUN: opt < %s -loop-vectorize -force-vector-width=1 -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S | FileCheck %s 
--check-prefix=SWIFTUNROLL + +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" +target triple = "thumbv7-apple-ios3.0.0" + +;CHECK-LABEL: @foo( +;CHECK: load <4 x i32> +;CHECK-NOT: load <4 x i32> +;CHECK: ret +;SWIFT-LABEL: @foo( +;SWIFT: load <4 x i32> +;SWIFT: load <4 x i32> +;SWIFT: ret +define i32 @foo(i32* nocapture %A, i32 %n) nounwind readonly ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %i.02 = phi i32 [ %5, %.lr.ph ], [ 0, %0 ] + %sum.01 = phi i32 [ %4, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i32, i32* %A, i32 %i.02 + %3 = load i32, i32* %2, align 4 + %4 = add nsw i32 %3, %sum.01 + %5 = add nsw i32 %i.02, 1 + %exitcond = icmp eq i32 %5, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %sum.0.lcssa = phi i32 [ 0, %0 ], [ %4, %.lr.ph ] + ret i32 %sum.0.lcssa +} + +; Verify the register limit. On arm we don't have 16 allocatable registers. +;SWIFTUNROLL-LABEL: @register_limit( +;SWIFTUNROLL: load i32 +;SWIFTUNROLL-NOT: load i32 +define i32 @register_limit(i32* nocapture %A, i32 %n) { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: + %i.02 = phi i32 [ %5, %.lr.ph ], [ 0, %0 ] + %sum.01 = phi i32 [ %4, %.lr.ph ], [ 0, %0 ] + %sum.02 = phi i32 [ %6, %.lr.ph ], [ 0, %0 ] + %sum.03 = phi i32 [ %7, %.lr.ph ], [ 0, %0 ] + %sum.04 = phi i32 [ %8, %.lr.ph ], [ 0, %0 ] + %sum.05 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ] + %sum.06 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i32, i32* %A, i32 %i.02 + %3 = load i32, i32* %2, align 4 + %4 = add nsw i32 %3, %sum.01 + %5 = add nsw i32 %i.02, 1 + %6 = add nsw i32 %3, %sum.02 + %7 = add nsw i32 %3, %sum.03 + %8 = add nsw i32 %3, %sum.04 + %9 = add nsw i32 %3, %sum.05 + %10 = add nsw i32 %3, %sum.05 + %exitcond = icmp eq i32 %5, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %sum.0.lcssa = phi i32 [ 0, %0 ], [ %4, %.lr.ph ] + %sum.1.lcssa = phi i32 [ 0, %0 ], [ %6, %.lr.ph ] + %sum.2.lcssa = phi i32 [ 0, %0 ], [ %7, %.lr.ph ] + %sum.4.lcssa = phi i32 [ 0, %0 ], [ %8, %.lr.ph ] + %sum.5.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] + %sum.6.lcssa = phi i32 [ 0, %0 ], [ %10, %.lr.ph ] + ret i32 %sum.0.lcssa +} diff --git a/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll new file mode 100644 index 00000000000..6d1fa6f36a9 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll @@ -0,0 +1,88 @@ +; RUN: opt -loop-vectorize -mtriple=thumbv7s-apple-ios6.0.0 -S -enable-interleaved-mem-accesses=false < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" + +@kernel = global [512 x float] zeroinitializer, align 4 +@kernel2 = global [512 x float] zeroinitializer, align 4 +@kernel3 = global [512 x float] zeroinitializer, align 4 +@kernel4 = global [512 x float] zeroinitializer, align 4 +@src_data = global [1536 x float] zeroinitializer, align 4 +@r_ = global i8 0, align 4 +@g_ = global i8 0, align 4 +@b_ = global i8 0, align 4 + +; We don't want to vectorize most loops containing gathers because they are +; expensive. This function represents a point where vectorization starts to +; become beneficial. +; Make sure we are conservative and don't vectorize it. 
+; CHECK-NOT: <2 x float> +; CHECK-NOT: <4 x float> + +define void @_Z4testmm(i32 %size, i32 %offset) { +entry: + %cmp53 = icmp eq i32 %size, 0 + br i1 %cmp53, label %for.end, label %for.body.lr.ph + +for.body.lr.ph: + br label %for.body + +for.body: + %r.057 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add10, %for.body ] + %g.056 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add20, %for.body ] + %v.055 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ] + %add = add i32 %v.055, %offset + %mul = mul i32 %add, 3 + %arrayidx = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %mul + %0 = load float, float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [512 x float], [512 x float]* @kernel, i32 0, i32 %v.055 + %1 = load float, float* %arrayidx2, align 4 + %mul3 = fmul fast float %0, %1 + %arrayidx4 = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i32 0, i32 %v.055 + %2 = load float, float* %arrayidx4, align 4 + %mul5 = fmul fast float %mul3, %2 + %arrayidx6 = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i32 0, i32 %v.055 + %3 = load float, float* %arrayidx6, align 4 + %mul7 = fmul fast float %mul5, %3 + %arrayidx8 = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i32 0, i32 %v.055 + %4 = load float, float* %arrayidx8, align 4 + %mul9 = fmul fast float %mul7, %4 + %add10 = fadd fast float %r.057, %mul9 + %arrayidx.sum = add i32 %mul, 1 + %arrayidx11 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %arrayidx.sum + %5 = load float, float* %arrayidx11, align 4 + %mul13 = fmul fast float %1, %5 + %mul15 = fmul fast float %2, %mul13 + %mul17 = fmul fast float %3, %mul15 + %mul19 = fmul fast float %4, %mul17 + %add20 = fadd fast float %g.056, %mul19 + %arrayidx.sum52 = add i32 %mul, 2 + %arrayidx21 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %arrayidx.sum52 + %6 = load float, float* %arrayidx21, align 4 + %mul23 = fmul fast float %1, %6 + %mul25 = fmul fast float %2, %mul23 + %mul27 = fmul fast float %3, %mul25 + %mul29 = fmul fast float %4, %mul27 + %add30 = fadd fast float %b.054, %mul29 + %inc = add i32 %v.055, 1 + %exitcond = icmp ne i32 %inc, %size + br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: + %add30.lcssa = phi float [ %add30, %for.body ] + %add20.lcssa = phi float [ %add20, %for.body ] + %add10.lcssa = phi float [ %add10, %for.body ] + %phitmp = fptoui float %add10.lcssa to i8 + %phitmp60 = fptoui float %add20.lcssa to i8 + %phitmp61 = fptoui float %add30.lcssa to i8 + br label %for.end + +for.end: + %r.0.lcssa = phi i8 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] + %g.0.lcssa = phi i8 [ %phitmp60, %for.cond.for.end_crit_edge ], [ 0, %entry ] + %b.0.lcssa = phi i8 [ %phitmp61, %for.cond.for.end_crit_edge ], [ 0, %entry ] + store i8 %r.0.lcssa, i8* @r_, align 4 + store i8 %g.0.lcssa, i8* @g_, align 4 + store i8 %b.0.lcssa, i8* @b_, align 4 + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/ARM/gcc-examples.ll b/llvm/test/Transforms/LoopVectorize/ARM/gcc-examples.ll new file mode 100644 index 00000000000..783156d7399 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/ARM/gcc-examples.ll @@ -0,0 +1,60 @@ +; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S -dce | FileCheck %s + +target datalayout = 
"e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" +target triple = "thumbv7-apple-ios3.0.0" + +@b = common global [2048 x i32] zeroinitializer, align 16 +@c = common global [2048 x i32] zeroinitializer, align 16 +@a = common global [2048 x i32] zeroinitializer, align 16 + +; Select VF = 8; +;CHECK-LABEL: @example1( +;CHECK: load <4 x i32> +;CHECK: add nsw <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example1() nounwind uwtable ssp { + br label %1 + +;